arm_compute v18.02 Change-Id: I7207aa488e5470f235f39b6c188b4678dc38d1a6

commit: 06ea048f062a50404b1b3998a61a45449c2d1f0f [log] [tgz]
author: Anthony Barbier <anthony.barbier@arm.com> Thu Feb 22 15:45:35 2018 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> Fri Feb 23 11:49:54 2018 +0000
tree: aa0dea3b0c49422538df9a5a02578b2c29e6fa67
parent: 292227986edb37b01061afcad6df18ba9d6ccbeb [diff]
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index b75ebcf..74af99b 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,7 +70,7 @@
     return false;
 }
 
-bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) const
+bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window)
 {
     ARM_COMPUTE_UNUSED(window);
 

diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 81ad60b..2ddd59e 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -163,7 +163,7 @@
     return window_modified;
 }
 
-bool AccessWindowStatic::update_padding_if_needed(const Window &window) const
+bool AccessWindowStatic::update_padding_if_needed(const Window &window)
 {
     ARM_COMPUTE_UNUSED(window);
 

diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 4506a0b..3c45ab3 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -180,7 +180,7 @@
     return window_modified;
 }
 
-bool AccessWindowTranspose::update_padding_if_needed(const Window &window) const
+bool AccessWindowTranspose::update_padding_if_needed(const Window &window)
 {
     // Only update the padding if the tensor allows it
     if(_info == nullptr || !_info->is_resizable())

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 22a328b..c7c08d4 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp

@@ -151,6 +151,7 @@
     { "activation_layer_qa8", "activation_layer_qa8.cl" },
     { "arithmetic_add", "arithmetic_op.cl" },
     { "arithmetic_sub", "arithmetic_op.cl" },
+    { "batchnormalization_layer", "batchnormalization_layer.cl" },
     { "bitwise_or", "bitwise_op.cl" },
     { "bitwise_and", "bitwise_op.cl" },
     { "bitwise_xor", "bitwise_op.cl" },
@@ -170,7 +171,7 @@
     { "combine_gradients_L2", "canny.cl" },
     { "concatenate_depth", "concatenate.cl" },
     { "convolution_rectangle", "convolution_rectangle.cl" },
-    { "col2im", "convolution_layer.cl" },
+    { "col2im", "col2im.cl" },
     { "convolution3x3_static", "convolution3x3.cl" },
     { "convolution5x5_static", "convolution5x5.cl" },
     { "convolution7x7_static", "convolution7x7.cl" },
@@ -188,7 +189,10 @@
     { "copy_to_keypoint", "fast_corners.cl" },
     { "deconvolution_upsample", "deconvolution_layer.cl" },
     { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" },
+    { "depthwise_convolution_3x3_stridex1_stridey1_bifrost", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_stridex2_stridey2_bifrost", "depthwise_convolution.cl" },
     { "depthwise_im2col", "depthwise_convolution.cl" },
     { "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
     { "depthwise_weights_reshape", "depthwise_convolution.cl" },
@@ -211,14 +215,13 @@
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
     { "gemm_accumulate_biases", "gemm.cl" },
-    { "gemm_interleave4x4_8bit", "gemm.cl" },
-    { "gemm_interleave4x4_16bit", "gemm.cl" },
-    { "gemm_interleave4x4_32bit", "gemm.cl" },
+    { "gemm_interleave4x4", "gemm.cl" },
     { "gemm_ma_f16", "gemm.cl" },
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_ma_qs8", "gemm.cl" },
     { "gemm_ma_qs16", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
+    { "gemm_mv_quantized", "gemv.cl" },
     { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
@@ -230,13 +233,13 @@
     { "gemm_mm_qs8", "gemm.cl" },
     { "gemm_mm_qs16", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
-    { "gemm_transpose1x16", "gemm.cl" },
-    { "gemm_transpose1x8", "gemm.cl" },
-    { "gemm_transpose1x4", "gemm.cl" },
+    { "gemm_transpose1xW", "gemm.cl" },
     { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
     { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
-    { "gemmlowp_mm", "gemmlowp.cl" },
-    { "gemmlowp_mm_interleaved_transposed", "gemmlowp.cl" },
+    { "gemmlowp_mm_bifrost", "gemmlowp.cl" },
+    { "gemmlowp_mm_midgard", "gemmlowp.cl" },
+    { "gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl" },
+    { "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution", "gemmlowp.cl" },
     { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
     { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
@@ -251,10 +254,13 @@
     { "hog_detector", "hog.cl" },
     { "hog_orientation_binning", "hog.cl" },
     { "hysteresis", "canny.cl" },
-    { "im2col_generic", "convolution_layer.cl" },
-    { "im2col_generic_padx0_pady0", "convolution_layer.cl" },
-    { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cl" },
-    { "im2col_reduced", "convolution_layer.cl" },
+    { "im2col1x1_stridex1_dchw", "im2col.cl" },
+    { "im2col3x3_dchw", "im2col.cl" },
+    { "im2col5x5_dchw", "im2col.cl" },
+    { "im2col11x11_padx0_pady0_dchw", "im2col.cl" },
+    { "im2col_generic_dchw", "im2col.cl" },
+    { "im2col_generic_padx0_pady0_dchw", "im2col.cl" },
+    { "im2col_reduced_dchw", "im2col.cl" },
     { "init_level", "optical_flow_pyramid_lk.cl" },
     { "init_level_max", "optical_flow_pyramid_lk.cl" },
     { "init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl" },
@@ -282,7 +288,6 @@
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
     { "normalization_layer_in_map", "normalization_layer.cl" },
-    { "batchnormalization_layer", "batchnormalization_layer.cl" },
     { "NV12_to_IYUV_bt709", "color_convert.cl" },
     { "NV12_to_RGB888_bt709", "color_convert.cl" },
     { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
@@ -291,6 +296,7 @@
     { "NV21_to_RGB888_bt709", "color_convert.cl" },
     { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
     { "NV21_to_YUV444_bt709", "color_convert.cl" },
+    { "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
     { "permute_201", "permute.cl" },
     { "permute_120", "permute.cl" },
     { "permute_3201", "permute.cl" },
@@ -300,8 +306,8 @@
     { "pooling_layer_3", "pooling_layer.cl" },
     { "pooling_layer_optimized_3", "pooling_layer.cl" },
     { "pooling_layer_7", "pooling_layer.cl" },
-    { "pooling_layer_N", "pooling_layer.cl" },
-    { "pooling_layer_N_quantized", "pooling_layer_quantized.cl" },
+    { "pooling_layer_MxN", "pooling_layer.cl" },
+    { "pooling_layer_MxN_quantized", "pooling_layer_quantized.cl" },
     { "quantization_layer", "quantization_layer.cl" },
     { "reduction_operation", "reduction_operation.cl" },
     { "remap_nearest_neighbour", "remap.cl" },
@@ -325,12 +331,10 @@
     { "sobel_separable1x5", "sobel_filter.cl" },
     { "sobel_separable7x1", "sobel_filter.cl" },
     { "sobel_separable1x7", "sobel_filter.cl" },
-    { "softmax_layer_max", "softmax_layer.cl" },
-    { "softmax_layer_max_quantized", "softmax_layer_quantized.cl" },
-    { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
-    { "softmax_layer_shift_exp_sum_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
     { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
+    { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
+    { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "suppress_non_maximum", "canny.cl" },
@@ -393,6 +397,10 @@
 #include "./cl_kernels/channel_extract.clembed"
     },
     {
+        "col2im.cl",
+#include "./cl_kernels/col2im.clembed"
+    },
+    {
         "concatenate.cl",
 #include "./cl_kernels/concatenate.clembed"
     },
@@ -525,6 +533,10 @@
 #include "./cl_kernels/hog.clembed"
     },
     {
+        "im2col.cl",
+#include "./cl_kernels/im2col.clembed"
+    },
+    {
         "integral_image.cl",
 #include "./cl_kernels/integral_image.clembed"
     },

diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 7da7438..491e0c4 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp

@@ -43,10 +43,11 @@
         return;
     }
 
-    // Make sure that dimensions > Z are 1
-    for(unsigned int i = 3; i < Coordinates::num_max_dimensions; ++i)
+    for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON((window[i].end() - window[i].start()) != 1);
+        ARM_COMPUTE_ERROR_ON(window[i].step() == 0);
+        // Make sure that dimensions > Z are 1
+        ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1));
     }
 
     cl::NDRange gws = ICLKernel::gws_from_window(window);
@@ -77,16 +78,6 @@
     queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
 }
 
-ICLKernel::ICLKernel()
-    : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0)
-{
-}
-
-cl::Kernel &ICLKernel::kernel()
-{
-    return _kernel;
-}
-
 template <unsigned int dimension_size>
 void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
 {
@@ -106,10 +97,10 @@
     unsigned int idx_start = idx;
     _kernel.setArg(idx++, tensor->cl_buffer());
 
-    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    for(unsigned int d = 0; d < dimension_size; ++d)
     {
-        _kernel.setArg<cl_uint>(idx++, strides[dimension]);
-        _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
+        _kernel.setArg<cl_uint>(idx++, strides[d]);
+        _kernel.setArg<cl_uint>(idx++, strides[d] * window[d].step());
     }
 
     _kernel.setArg<cl_uint>(idx++, offset_first_element);
@@ -119,66 +110,16 @@
     ARM_COMPUTE_UNUSED(idx_start);
 }
 
-void ICLKernel::add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-{
-    add_tensor_argument<1>(idx, tensor, window);
-}
-
-void ICLKernel::add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-{
-    add_tensor_argument<2>(idx, tensor, window);
-}
-
-void ICLKernel::add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-{
-    add_tensor_argument<3>(idx, tensor, window);
-}
-
-void ICLKernel::add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
-{
-    add_tensor_argument<4>(idx, tensor, window);
-}
-
-unsigned int ICLKernel::num_arguments_per_1D_array() const
-{
-    return num_arguments_per_array<1>();
-}
-
-unsigned int ICLKernel::num_arguments_per_1D_tensor() const
-{
-    return num_arguments_per_tensor<1>();
-}
-
-unsigned int ICLKernel::num_arguments_per_2D_tensor() const
-{
-    return num_arguments_per_tensor<2>();
-}
-
-unsigned int ICLKernel::num_arguments_per_3D_tensor() const
-{
-    return num_arguments_per_tensor<3>();
-}
-
-unsigned int ICLKernel::num_arguments_per_4D_tensor() const
-{
-    return num_arguments_per_tensor<4>();
-}
+template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
+template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
+template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window);
+template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window);
 
 void ICLKernel::set_target(cl::Device &device)
 {
     _target = get_target_from_device(device);
 }
 
-void ICLKernel::set_target(GPUTarget target)
-{
-    _target = target;
-}
-
-GPUTarget ICLKernel::get_target() const
-{
-    return _target;
-}
-
 size_t ICLKernel::get_max_workgroup_size()
 {
     if(_max_workgroup_size == 0)

diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 726279c..06d10a4 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,6 +108,9 @@
     LOAD_FUNCTION_PTR(clRetainEvent, handle);
     LOAD_FUNCTION_PTR(clGetPlatformIDs, handle);
     LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle);
+    LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle);
+    LOAD_FUNCTION_PTR(clGetKernelInfo, handle);
+    LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle);
 
 #undef LOAD_FUNCTION_PTR
 
@@ -729,3 +732,60 @@
         return CL_OUT_OF_RESOURCES;
     }
 }
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue      command_queue,
+                      cl_command_queue_info param_name,
+                      size_t                param_value_size,
+                      void                 *param_value,
+                      size_t               *param_value_size_ret)
+{
+    // load_default() has to run before the pointer is read (presumably it populates the *_ptr members)
+    arm_compute::CLSymbols::get().load_default();
+    auto query_fn = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
+
+    // No entry point available: report an error code rather than calling through a null pointer
+    if(query_fn == nullptr)
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // Forward every argument untouched and hand back the callee's status
+    return query_fn(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
+}
+
+cl_int
+clGetKernelInfo(cl_kernel      kernel,
+                cl_kernel_info param_name,
+                size_t         param_value_size,
+                void          *param_value,
+                size_t        *param_value_size_ret)
+{
+    // load_default() has to run before the pointer is read (presumably it populates the *_ptr members)
+    arm_compute::CLSymbols::get().load_default();
+    auto query_fn = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
+
+    // No entry point available: report an error code rather than calling through a null pointer
+    if(query_fn == nullptr)
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // Forward every argument untouched and hand back the callee's status
+    return query_fn(kernel, param_name, param_value_size, param_value, param_value_size_ret);
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event          event,
+                        cl_profiling_info param_name,
+                        size_t            param_value_size,
+                        void             *param_value,
+                        size_t           *param_value_size_ret)
+{
+    // load_default() has to run before the pointer is read (presumably it populates the *_ptr members)
+    arm_compute::CLSymbols::get().load_default();
+    auto query_fn = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
+
+    // No entry point available: report an error code rather than calling through a null pointer
+    if(query_fn == nullptr)
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+
+    // Forward every argument untouched and hand back the callee's status
+    return query_fn(event, param_name, param_value_size, param_value, param_value_size_ret);
+}
\ No newline at end of file

diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index 02668f7..cb31e99 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl

@@ -25,10 +25,15 @@
 
 #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 
+// RELU Activation: element-wise max of x and the quantized zero (CONST_0)
+inline TYPE relu_op(TYPE x)
+{
+    const TYPE quantized_zero = (TYPE)CONST_0;
+    return max(quantized_zero, x);
+}
 // Bounded RELU Activation
 inline TYPE brelu_op(TYPE x)
 {
-    return min((TYPE)A_VAL, max(0, x));
+    // Lower bound is the quantized zero (CONST_0), upper bound is A_VAL.
+    // CONST_0 is cast to TYPE so that max() receives two operands of the same vector type:
+    // OpenCL C only defines max(gentype, gentype) and max(gentype, sgentype), not a scalar-first form.
+    return min((TYPE)A_VAL, max((TYPE)CONST_0, x));
 }
 // Lower Upper Bounded RELU Activation
 inline TYPE lu_brelu_op(TYPE x)
@@ -49,6 +54,7 @@
  * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
  * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
  * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)

diff --git a/src/core/CL/cl_kernels/asymm_helper.h b/src/core/CL/cl_kernels/asymm_helper.h
deleted file mode 100644
index 18c1475..0000000
--- a/src/core/CL/cl_kernels/asymm_helper.h
+++ /dev/null

@@ -1,275 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASYMM_HELPER_H
-#define ARM_COMPUTE_ASYMM_HELPER_H
-
-// Algoriths for these functions were taken from
-// https://github.com/google/gemmlowp/blob/master/fixedpoint/fixedpoint.h
-// and adapted to operate on integer vectors.
-
-/** For each element of input vector, the corresponding bits of the result item are set
- * if the input item is zero.
- *
- * @param[in] a Input vector whose zero bits define which corresponding bits in result will be set.
- *
- * @returns Output vector with bits set when corresponding bit in @p a is zero.
- */
-inline int16 asymm_mask_if_zero(int16 a)
-{
-    const int16 all_zeros = 0;
-    const int16 all_ones  = ~0;
-    return select(all_zeros, all_ones, a == 0);
-}
-
-/** For each element of input vector, the corresponding bits of the result item are set
- * if the input item is non-zero.
- *
- * @param[in] a Input vector whose non-zero bits define which corresponding bits in result will be set.
- *
- * @returns Output vector with bits set when corresponding bit in @p a is non zero.
- */
-inline int16 asymm_mask_if_non_zero(int16 a)
-{
-    const int16 all_zeros = 0;
-    const int16 all_ones  = ~0;
-    return select(all_zeros, all_ones, a != 0);
-}
-
-/** Each bit of the result is set to the corresponding bit of either then_val or
- * else_val depending on whether the corresponding bit of if_mask is set.
- * Equivalent to the VBSL instruction in ARM NEON.
- *
- * @param[in] if_mask  Mask defines will bit be taken from @p then_val or @p else_val depending on corresponding bit in mask is set or not.
- * @param[in] then_val Value whose bit will be used for result when corresponding bit in @p if_mask is set.
- * @param[in] else_val Value whose bit will be used for result when corresponding bit in @p if_mask is not set.
- *
- * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding bit in @p if_mask is set or not.
- */
-inline int16 asymm_select_using_mask(int16 if_mask, int16 then_val, int16 else_val)
-{
-    return (if_mask & then_val) ^ (~if_mask & else_val);
-}
-
-/** Correctly rounded to nearest division by a power of two.
- * Also known as a rounding arithmetic right shift.
- *
- * @param[in] x        Value needed to be divided by power of two.
- * @param[in] exponent Power of two, must be positive number.
- *
- * @return Arithmetic right shift.
- */
-inline int16 asymm_rounding_divide_by_pow2(int16 x, int exponent)
-{
-    int16       mask      = (1 << exponent) - 1;
-    const int16 zero      = 0;
-    const int16 one       = 1;
-    int16       threshold = (mask >> 1) + select(zero, one, x < 0);
-    return (x >> exponent) + select(zero, one, (x & mask) > threshold);
-}
-
-/** Calculates the product of a integer value by a power of two, with either a positive exponent
- * (equivalent to an arithmetic left shift, saturating) or a negative exponent
- * (equivalent to an arithmetic right shift, rounding to nearest).
- *
- * @param[in] x        Value needed to be multiplied or divided by power of two depending on sign of @p exponent.
- * @param[in] exponent Power of two, can be positive or negative number.
- *
- * @return Arithmetic left or right shift.
- */
-inline int16 asymm_saturating_rounding_mult_by_pow2(int16 x, int exponent)
-{
-    if(exponent < 0)
-    {
-        return asymm_rounding_divide_by_pow2(x, -exponent);
-    }
-
-    const int16 min           = INT_MIN;
-    const int16 max           = INT_MAX;
-    int         threshold     = ((1 << (31 - exponent)) - 1);
-    int16       positive_mask = asymm_mask_if_non_zero(x > threshold);
-    int16       negative_mask = asymm_mask_if_non_zero(x < -threshold);
-    int16       result        = x << exponent;
-    result                    = asymm_select_using_mask(positive_mask, max, result);
-    result                    = asymm_select_using_mask(negative_mask, min, result);
-    return result;
-}
-
-/** Calculates (a+b)/2, rounded to the nearest integer.
- * Equivalent to VRHADD in the ARM NEON instruction set.
- *
- * @param[in] a First term of half-sum.
- * @param[in] b Second term of half-sum.
- *
- * @return (a+b)/2, rounded to the nearest integer.
- */
-inline int16 asymm_rounding_half_sum(int16 a, int16 b)
-{
-    long16       a64       = convert_long16(a);
-    long16       b64       = convert_long16(b);
-    long16       sum       = a64 + b64;
-    const long16 one       = 1;
-    const long16 minus_one = -1;
-    long16       sign      = select(minus_one, one, sum >= 0);
-    return convert_int16((sum + sign) / 2);
-}
-
-/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
- * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
- * This is equivalent to the VQRDMULH instruction in ARM NEON.
- *
- * @param[in] a First term of product.
- * @param[in] b Second term of product.
- *
- * @return Product of two numbers.
- */
-inline int16 asymm_saturating_rounding_doubling_high_mul(int16 a, int16 b)
-{
-    int16  overflow     = (a == b) && (a == INT_MIN);
-    long16 a_64         = convert_long16(a);
-    long16 b_64         = convert_long16(b);
-    long16 ab_64        = a_64 * b_64;
-    long16 mask1        = 1 << 30;
-    long16 mask2        = 1 - (1 << 30);
-    long16 nudge        = select(mask2, mask1, ab_64 >= 0);
-    long16 mask         = 1ll << 31;
-    int16  ab_x2_high32 = convert_int16((ab_64 + nudge) / mask);
-    return select(ab_x2_high32, INT_MAX, overflow);
-}
-
-/** Fixed-point multiplication.
- *
- * @param[in] a Argument 1 in fixed-point format Q(a).
- * @param[in] b Argument 2 in fixed-point format Q(b).
- *
- * @return Result in fixed-point format Q(a+b).
- */
-inline int16 asymm_mult(int16 a, int16 b)
-{
-    return asymm_saturating_rounding_doubling_high_mul(a, b);
-}
-
-/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
- *
- * @param[in] a Argument in fixed-point format Q0.
- *
- * @return Result in fixed-point format Q0.
- */
-inline int16 asymm_exp_on_interval_between_negative_one_quarter_and_0_excl(int16 a)
-{
-    const int16 constant_term                            = 1895147668;
-    const int16 constant_1_over_3                        = 715827883;
-    const int   k_fractional_bits                        = 31;
-    int16       x                                        = a + (1 << (k_fractional_bits - 3));
-    int16       x2                                       = asymm_mult(x, x);
-    int16       x3                                       = asymm_mult(x2, x);
-    int16       x4                                       = asymm_mult(x2, x2);
-    int16       x4_over_4                                = asymm_rounding_divide_by_pow2(x4, 2);
-    int16       x4_over_24_plus_x3_over_6_plus_x2        = asymm_mult((x4_over_4 + x3), constant_1_over_3) + x2;
-    int16       x4_over_24_plus_x3_over_6_plus_x2_over_2 = asymm_rounding_divide_by_pow2(x4_over_24_plus_x3_over_6_plus_x2, 1);
-    return constant_term + asymm_mult(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2);
-}
-
-/** Calculates \f$ exp(x) \f$ for x < 0.
- *
- * @param[in] a              Argument in fixed-point format Q(k_integer_bits).
- * @param[in] k_integer_bits Number of integer bit in argument.
- *
- * @return Result in fixed-point format Q0.
- */
-inline int16 asymm_exp_on_negative_values(int16 a, int k_integer_bits)
-{
-    const int k_fractional_bits                      = 31 - k_integer_bits;
-    int16     k_one_quarter                          = 1 << (k_fractional_bits - 2);
-    int16     mask                                   = k_one_quarter - 1;
-    int16     a_mod_quarter_minus_one_quarter        = (a & mask) - k_one_quarter;
-    int16     a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;
-    int16     result                                 = asymm_exp_on_interval_between_negative_one_quarter_and_0_excl(a_mod_quarter_minus_one_quarter_scaled);
-    int16     remainder                              = a_mod_quarter_minus_one_quarter - a;
-
-#define EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier)                                       \
-    if(k_integer_bits > Exponent)                                                                \
-    {                                                                                            \
-        const int k_shift_amount = k_integer_bits > Exponent ? k_fractional_bits + Exponent : 0; \
-        result                   = asymm_select_using_mask(                                      \
-                                                                                                 asymm_mask_if_non_zero(remainder & (1 << k_shift_amount)),                           \
-                                                                                                 asymm_mult(result, FixedPointMultiplier), result);                                   \
-    }
-    EXP_BARREL_SHIFTER(-2, 1672461947);
-    EXP_BARREL_SHIFTER(-1, 1302514674);
-    EXP_BARREL_SHIFTER(+0, 790015084);
-    EXP_BARREL_SHIFTER(+1, 290630308);
-    EXP_BARREL_SHIFTER(+2, 39332535);
-    EXP_BARREL_SHIFTER(+3, 720401);
-    EXP_BARREL_SHIFTER(+4, 242);
-#undef EXP_BARREL_SHIFTER
-
-    if(k_integer_bits > 5)
-    {
-        const int16 clamp = -(1 << (k_fractional_bits + 5));
-        result            = asymm_select_using_mask(asymm_mask_if_non_zero(a < clamp), 0, result);
-    }
-
-    const int16 Q0_one = INT_MAX;
-    return asymm_select_using_mask(asymm_mask_if_zero(a), Q0_one, result);
-}
-
-/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
- *
- * @param[in] a Argument in fixed-point format Q0.
- *
- * @return Result in fixed-point format Q0.
- */
-inline int16 asymm_one_over_one_plus_x_for_x_in_0_1(int16 a)
-{
-    const int16 Q0_one            = INT_MAX;
-    const int16 Q2_one            = 1 << (31 - 2);
-    int16       half_denominator  = asymm_rounding_half_sum(a, Q0_one);
-    const int16 Q2_48_over_17     = 1515870810;
-    const int16 Q2_neg_32_over_17 = -1010580540;
-    int16       x                 = Q2_48_over_17 + asymm_mult(half_denominator, Q2_neg_32_over_17);
-    for(int i = 0; i < 3; i++)
-    {
-        int16 half_denominator_times_x           = asymm_mult(half_denominator, x);
-        int16 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;
-        int16 tmp                                = asymm_mult(x, one_minus_half_denominator_times_x);
-        x                                        = x + asymm_saturating_rounding_mult_by_pow2(tmp, 2);
-    }
-    return asymm_saturating_rounding_mult_by_pow2(x, 1);
-}
-
-/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
- *
- * @param[in] value            Value to be rescaled.
- * @param[in] src_integer_bits Old number of integer bits.
- * @param[in] dst_integer_bits New number of integer bits.
- *
- * @return Rescaled value.
- */
-inline int16 asymm_rescale(int16 value, int src_integer_bits, int dst_integer_bits)
-{
-    int exponent = src_integer_bits - dst_integer_bits;
-    return asymm_saturating_rounding_mult_by_pow2(value, exponent);
-}
-
-#endif // ARM_COMPUTE_ASYMM_HELPER_H

diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index fbffefb..5ddeb1a 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl

@@ -23,6 +23,8 @@
  */
 #include "helpers.h"
 
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
 #if defined(FIXED_POINT_POSITION)
 #include "fixed_point.h"
 
@@ -42,6 +44,16 @@
 
 #endif /* FIXED_POINT_POSITION */
 
+#if defined(LU_BRELU) /* Lower/upper bounded RELU: CLAMP x with B_VAL and A_VAL as the bounds */
+#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL)
+#elif defined(BRELU) /* Bounded RELU: CLAMP x with 0 and A_VAL as the bounds */
+#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)0, (DATA_TYPE)A_VAL)
+#elif defined(RELU) /* RELU: max of x and 0 */
+#define ACTIVATION_FUNC(x) max(x, (DATA_TYPE)0)
+#else /* None of LU_BRELU / BRELU / RELU defined: the value passes through unchanged */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* LU_BRELU / BRELU / RELU */
+
 /** Apply batch normalization.
  *
  * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
@@ -126,6 +138,13 @@
     gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
     beta_vec  = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
 
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res = ADD_OP(MUL_OP(gamma_vec, x_bar), beta_vec);
+
+    res = ACTIVATION_FUNC(res);
+
     VSTORE(VEC_SIZE)
-    (ADD_OP(MUL_OP(gamma_vec, x_bar), beta_vec), 0, (__global DATA_TYPE *)out.ptr);
+    (res, 0, (__global DATA_TYPE *)out.ptr);
 }
+
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
\ No newline at end of file

diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
index e95bda4..e99e9eb 100644
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ b/src/core/CL/cl_kernels/channel_extract.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -268,5 +268,5 @@
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
     // Copy plane data
-    vstore16(vload16(0, src.ptr), 0, dst.ptr);
+    vstore8(vload8(0, src.ptr), 0, dst.ptr);
 }

diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
new file mode 100644
index 0000000..9b5a7b5
--- /dev/null
+++ b/src/core/CL/cl_kernels/col2im.cl

@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif // FIXED_POINT_POSITION
+
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
+#if !defined(FIXED_POINT_POSITION)
+
+#if ELEMENT_SIZE == 1 // COND_DATA_TYPE: integer type of ELEMENT_SIZE bytes, used for the select() condition vector in col2im
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2 // 2-byte elements
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4 // 4-byte elements
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE is not 1, 2 or 4
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+/** This kernel performs a reshaping of the output of the convolution layer (variant built when FIXED_POINT_POSITION is not defined)
+ * Each work-item loads 8 consecutive source elements and stores them along the destination Z dimension; lanes at or beyond WIDTH_INPUT are redirected to the first lane's position.
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
+ * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
+ * @note The element size (1, 2 or 4; it selects COND_DATA_TYPE) must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void col2im(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint dst_stride_w)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    uint  x         = get_global_id(0) * 8;
+    uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+    cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8));
+
+    // Lanes whose index is not below WIDTH_INPUT fall back to index x (the index of lane 0); in-range lanes keep their own index
+    x_clamped = select((uint8)x, x_clamped, convert_int8(cond0));
+
+    // Those same lanes take the value of lane 0, so their stores below rewrite lane 0's value at lane 0's position
+    data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes;
+
+    // Compute output offset in bytes: global id 1 is split into (y, x) using WIDTH_OUTPUT, global id 2 selects the W position
+    int idx = (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
+
+    // Store the 8 lanes one by one; the (clamped) lane index selects the position along the destination Z dimension
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s1 * dst_stride_z)) = data.s1;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s2 * dst_stride_z)) = data.s2;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s3 * dst_stride_z)) = data.s3;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s4 * dst_stride_z)) = data.s4;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s5 * dst_stride_z)) = data.s5;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6;
+    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7;
+}
+#else  // !defined(FIXED_POINT_POSITION)
+/** This kernel performs a reshaping of the output of the convolution layer (variant built when FIXED_POINT_POSITION is defined).
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=qs8
+ * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=320
+ * @note -DELEMENT_SIZE and -DWIDTH_INPUT must also be defined for this kernel to be compiled (enclosing #if), although the body does not use them
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void col2im(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint dst_stride_w)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+
+    // Compute output offset in bytes: global id 0 selects the Z position, global id 1 is split into (y, x) using WIDTH_OUTPUT, global id 2 selects the W position
+    int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
+
+    // Copy a single element from the current source position to the computed destination offset
+    *((__global DATA_TYPE *)(dst.ptr + idx)) = *((__global DATA_TYPE *)(src.ptr));
+}
+#endif // !defined(FIXED_POINT_POSITION)
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
\ No newline at end of file

diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 77b9b64..f8e0c27 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "fixed_point.h"
 #endif // FIXED_POINT_POSITION
 
+#if defined(DATA_TYPE)
 /** This kernel reshapes the tensor's low three dimensions to single column
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
@@ -96,319 +97,4 @@
         }
     }
 }
-
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PAD_VALUE)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The value to use for the paddings must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic(
-    TENSOR3D_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
-    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
-    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
-
-    // Calculate input indices
-    const int xi = xc * STRIDE_X - PAD_LEFT;
-    const int yi = yc * STRIDE_Y - PAD_TOP;
-
-    // Calculate output indices
-    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
-    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
-    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
-    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
-
-    // Linearize convolution elements
-    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
-    {
-        for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
-        {
-#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
-            *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
-            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
-            {
-                *output_ptr = PAD_VALUE;
-            }
-            else
-            {
-                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-            }
-#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
-        }
-    }
-
-#ifdef HAS_BIAS
-    if(ch == (KERNEL_DEPTH - 1))
-    {
-#ifdef FIXED_POINT_POSITION
-        *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else  // FIXED_POINT_POSITION
-        *output_ptr       = 1.0f;
-#endif // FIXED_POINT_POSITION
-    }
-#endif // HAS_BIAS
-}
-
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3 and pad_x = pad_y = 0
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_kernel3x3_padx0_pady0(
-    TENSOR3D_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
-    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
-    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
-
-    // Calculate input indices
-    const int xi = xc * STRIDE_X;
-    const int yi = yc * STRIDE_Y;
-
-    // Calculate output indices
-    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
-    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
-    // Get input and output address
-    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
-
-    __global DATA_TYPE *output_ptr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w) + xo;
-
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
-
-    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, output_ptr);
-    *(output_ptr + 8) = row2.s2;
-
-#ifdef HAS_BIAS
-    if(ch == (KERNEL_DEPTH - 1))
-    {
-#ifdef FIXED_POINT_POSITION
-        *(output_ptr + 9) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else  // FIXED_POINT_POSITION
-        *(output_ptr + 9) = 1.0f;
-#endif // FIXED_POINT_POSITION
-    }
-#endif // HAS_BIAS
-}
-#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
-
-#if defined(WIDTH_OUTPUT)
-/** This kernel performs a reshaping of the output of the convolution layer.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void col2im(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint dst_stride_w)
-{
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
-
-    // Compute output offset
-    int idx = get_global_id(0) * dst.stride_z + (get_global_id(1) / WIDTH_OUTPUT) * dst_stride_y + (get_global_id(1) % WIDTH_OUTPUT) * dst_stride_x + get_global_id(2) * dst_stride_w;
-
-    // Store value
-    *((__global DATA_TYPE *)(dst.ptr + idx)) = *((__global DATA_TYPE *)(src.ptr));
-}
-#endif // defined(WIDTH_OUTPUT)
-
-/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  width                             The width of the input tensor
- * @param[in]  height                            The height of the input tensor
- */
-__kernel void im2col_reduced(
-    TENSOR3D_DECLARATION(src),
-    VECTOR_DECLARATION(dst),
-    uint width, uint height)
-{
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
-    const uint image_size = width * height;
-
-    __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
-
-    *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
-
-#ifdef HAS_BIAS
-    // If it is the last thread in the 3 dimensional workgroup
-    if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
-    {
-        tmp_out_ptr += dst_stride_x;
-#ifdef FIXED_POINT_POSITION
-        *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else  // FIXED_POINT_POSITION
-        *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1;
-#endif // FIXED_POINT_POSITION
-    }
-#endif // HAS_BIAS
-}
-
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
- * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
- * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_padx0_pady0(
-    TENSOR3D_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
-    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
-    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
-
-    // Calculate input indices
-    const int xi = xc * STRIDE_X;
-    const int yi = yc * STRIDE_Y;
-    // Calculate output indices
-    const int xo                   = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
-    const int yo                   = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
-    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
-    // Linearize convolution elements
-    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
-    {
-        int last_x = 0;
-        for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
-        {
-            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-            VSTORE(VECTOR_SIZE)
-            (row, 0, output_ptr);
-            last_x = x;
-        }
-        // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
-        // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.
-#if WIDTH_MOD_VECTOR_SIZE == 1
-        *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
-#elif WIDTH_MOD_VECTOR_SIZE > 1
-        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
-        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
-        VSTORE(WIDTH_MOD_VECTOR_SIZE)
-        (row, 0, output_ptr);
-#endif /* WIDTH_MOD_VECTOR_SIZE */
-        output_ptr += WIDTH_MOD_VECTOR_SIZE;
-    } /* End of loop over KERNEL_HEIGHT */
-
-#ifdef HAS_BIAS
-    if(ch == (KERNEL_DEPTH - 1))
-    {
-#ifdef FIXED_POINT_POSITION
-        *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
-#else  // FIXED_POINT_POSITION
-        *output_ptr       = 1.0f;
-#endif // FIXED_POINT_POSITION
-    }
-#endif // HAS_BIAS
-}
-#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+#endif // defined(DATA_TYPE)
\ No newline at end of file

diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 89555a0..f352138 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -144,9 +144,9 @@
     return pixels;
 }
 
-/** This function computes the horizontal integral of the image.
+/** This OpenCL kernel computes the depthwise convolution 3x3
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: U8
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
@@ -154,7 +154,7 @@
  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
@@ -162,7 +162,7 @@
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -175,7 +175,6 @@
  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
-
 __kernel void depthwise_convolution_3x3(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
@@ -207,9 +206,214 @@
 
     vstore2(pixels, 0, (__global float *)dst.ptr);
 }
-
 #endif //defined(CONV_STRIDE_X)
 
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
+    ({                                                             \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
+        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
+        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
+        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
+    })
+
+#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
+    ({                                                                   \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
+        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
+        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
+        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
+    })
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
+ * stride_x and stride_y are equal to 1. Each work-item writes 4 rows of 2 output elements.
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+    float2 pixels0 = 0.0f;
+    float2 pixels1 = 0.0f;
+    float2 pixels2 = 0.0f;
+    float2 pixels3 = 0.0f;
+
+    __global uchar *weights_addr = (__global uchar *)weights.ptr;
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    // Load the three rows of the 3x3 weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 4x2 elements (4 rows of 2), we need to load 6 rows from the input tensor
+    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
+    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
+    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
+    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
+    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
+    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0); // Output row 0: input rows 0-2
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0); // Output row 1: input rows 1-3
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0); // Output row 2: input rows 2-4
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0); // Output row 3: input rows 3-5
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2)))); // One bias per Z index (global id 2)
+
+    pixels0 += (float2)bias;
+    pixels1 += (float2)bias;
+    pixels2 += (float2)bias;
+    pixels3 += (float2)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+    vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
+    vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
+}
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
+ * stride_x and stride_y are equal to 2. Each work-item writes 2 rows of 2 output elements.
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+    float2 pixels0 = 0.0f;
+    float2 pixels1 = 0.0f;
+
+    __global uchar *weights_addr = (__global uchar *)weights.ptr;
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    // Load the three rows of the 3x3 weights
+    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
+    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
+    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 2x2 elements with a stride of 2, we need to load 5 rows (6 elements each) from the input tensor
+    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0, elements 0-3
+    float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0, elements 4-5
+    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1, elements 0-3
+    float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1, elements 4-5
+    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2, elements 0-3
+    float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2, elements 4-5
+    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3, elements 0-3
+    float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3, elements 4-5
+    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4, elements 0-3
+    float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4, elements 4-5
+
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0); // Output row 0: input rows 0-2
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0); // Output row 1: input rows 2-4
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
+    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2)))); // One bias per Z index (global id 2)
+
+    pixels0 += (float2)bias;
+    pixels1 += (float2)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
+    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
+}
+
 #if defined(SRC_WIDTH) && defined(DATA_TYPE)
 /** This kernel reshapes each of the tensor's low three dimensions to single rows.
  *
@@ -265,7 +469,7 @@
 }
 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
@@ -288,7 +492,6 @@
  * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-
 __kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
 {
     Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
@@ -310,7 +513,7 @@
         {
             if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
             {
-                *output_ptr = 0;
+                *output_ptr = PAD_VALUE;
             }
             else
             {
@@ -323,7 +526,7 @@
 #endif // defined(HAS_BIAS)
 }
 
-#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE)
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)
 
 #if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 
@@ -361,3 +564,187 @@
 }
 
 #endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(CONV_STRIDE_X)
+#if CONV_STRIDE_X == 1
+#define convolution1x3_f16 convolution1x3_stride_1_f16
+#elif CONV_STRIDE_X == 2
+#define convolution1x3_f16 convolution1x3_stride_2_f16
+#elif CONV_STRIDE_X == 3
+#define convolution1x3_f16 convolution1x3_stride_3_f16
+#else /* CONV_STRIDE_X */
+#error "Stride not supported"
+#endif /* CONV_STRIDE_X */
+
+/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
+                                         const half            left_coeff,
+                                         const half            middle_coeff,
+                                         const half            right_coeff)
+{
+    half8 temp = vload8(0, (__global half *)left_pixel);
+
+    half4 left   = CONVERT(temp.s0123, half4);
+    half4 middle = CONVERT(temp.s1234, half4);
+    half4 right  = CONVERT(temp.s2345, half4);
+
+    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
+                                         const half            left_coeff,
+                                         const half            middle_coeff,
+                                         const half            right_coeff)
+{
+    half8 temp0 = vload8(0, (__global half *)left_pixel);
+    half temp1  = *((__global half *)(left_pixel + 8 * sizeof(half)));
+
+    half4 left   = CONVERT(temp0.s0246, half4);
+    half4 middle = CONVERT(temp0.s1357, half4);
+    half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);
+
+    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
+ *
+ * @param[in] left_pixel   Pointer to the left pixel.
+ * @param[in] left_coeff   Weight of the left pixel
+ * @param[in] middle_coeff Weight of the middle pixel
+ * @param[in] right_coeff  Weight of the right pixel
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
+                                         const half            left_coeff,
+                                         const half            middle_coeff,
+                                         const half            right_coeff)
+{
+    half16 temp0 = vload16(0, (__global half *)left_pixel);
+
+    half4 left   = CONVERT(temp0.s0369, half4);
+    half4 middle = CONVERT(temp0.s147A, half4);
+    half4 right  = CONVERT(temp0.s258B, half4);
+
+    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
+}
+
+/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src  A pointer to source Image structure
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat1 Coefficient from the convolution matrix
+ * @param[in] mat2 Coefficient from the convolution matrix
+ * @param[in] mat3 Coefficient from the convolution matrix
+ * @param[in] mat4 Coefficient from the convolution matrix
+ * @param[in] mat5 Coefficient from the convolution matrix
+ * @param[in] mat6 Coefficient from the convolution matrix
+ * @param[in] mat0 Coefficient from the convolution matrix
+ * @param[in] mat7 Coefficient from the convolution matrix
+ * @param[in] mat8 Coefficient from the convolution matrix
+ *
+ * @return a half4 containing 4 convoluted values.
+ */
+inline half4 convolution3x3_f16(
+    Image     *src,
+    const half mat0, const half mat1, const half mat2,
+    const half mat3, const half mat4, const half mat5,
+    const half mat6, const half mat7, const half mat8)
+{
+    half4 pixels;
+
+    pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
+    pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
+    pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);
+
+    return pixels;
+}
+
+/** This OpenCL kernel computes the depthwise convolution 3x3 for F16 data; each work-item writes 4 output elements
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F16
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_f16(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif //defined(HAS_BIAS)
+
+    uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y; // Byte offsets of the 3 weight rows. NOTE(review): 8-bit arithmetic, presumably wraps if 2 * weights_stride_y > 255 - confirm
+    half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
+    half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
+    half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));
+
+    half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+                                      weights_values1.s0, weights_values1.s1, weights_values1.s2,
+                                      weights_values2.s0, weights_values2.s1, weights_values2.s2);
+#if defined(HAS_BIAS)
+    pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x))); // One bias per Z index (global id 2)
+#endif //defined(HAS_BIAS)
+
+    vstore4(pixels, 0, (__global half *)dst.ptr);
+}
+#endif // defined(CONV_STRIDE_X)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 8a757fc..40538a1 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl

@@ -24,159 +24,45 @@
 
 #include "helpers_asymm.h"
 
-#if defined(CONV_STRIDE_X)
+#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+
+#if CONV_STRIDE_X > 3
+#error "Stride X not supported"
+#endif /* CONV_STRIDE_X > 3 */
 
 #if CONV_STRIDE_X == 1
-#define convolution1x3 convolution1x3_stride_1
+#define GET_VALUES(first_value, left, middle, right)                                \
+    ({                                                                              \
+        int8 temp0 = CONVERT(vload8(0, (first_value)), int8);                       \
+        int2 temp1 = CONVERT(vload2(0, ((first_value) + 8 * sizeof(uchar))), int2); \
+        /* Three overlapping 8-wide windows, shifted by one pixel each */           \
+        left   = CONVERT(temp0.s01234567, int8);                                    \
+        middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);          \
+        right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);          \
+    })
 #elif CONV_STRIDE_X == 2
-#define convolution1x3 convolution1x3_stride_2
-#elif CONV_STRIDE_X == 3
-#define convolution1x3 convolution1x3_stride_3
+#define GET_VALUES(first_value, left, middle, right)                       \
+    ({                                                                     \
+        int16 temp0 = CONVERT(vload16(0, (first_value)), int16);           \
+        int   temp1 = CONVERT(*((first_value) + 16 * sizeof(uchar)), int); \
+        /* Even pixels, odd pixels, even pixels shifted by two */          \
+        left   = CONVERT(temp0.s02468ace, int8);                           \
+        middle = CONVERT(temp0.s13579bdf, int8);                           \
+        right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);    \
+    })
 #else /* CONV_STRIDE_X */
-#error "Stride not supported"
+#define GET_VALUES(first_value, left, middle, right)                                  \
+    ({                                                                                \
+        int16 temp0 = CONVERT(vload16(0, (first_value)), int16);                      \
+        int8  temp1 = CONVERT(vload8(0, ((first_value) + 16 * sizeof(uchar))), int8); \
+        /* Every third pixel, starting at offsets 0, 1 and 2 */                       \
+        left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);            \
+        middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);            \
+        right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);            \
+    })
 #endif /* CONV_STRIDE_X */
 
-/** Compute a 1D horizontal convolution of size 3 and stride 1 for uchar type.
- *
- * @param[in] left_pixel    Pointer to the left pixel.
- * @param[in] left_coeff    Weight of the left pixel
- * @param[in] middle_coeff  Weight of the middle pixel
- * @param[in] right_coeff   Weight of the right pixel
- * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
- * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
- *
- * @return a int8 containing 8 convoluted values.
- */
-inline int8 convolution1x3_stride_1(__global const uchar *left_pixel,
-                                    const int             left_coeff,
-                                    const int             middle_coeff,
-                                    const int             right_coeff,
-                                    const int             input_offset,
-                                    const int             weight_offset)
-{
-    int8 temp0 = CONVERT(vload8(0, left_pixel), int8);
-    int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2);
-
-    int8 left   = CONVERT(temp0.s01234567, int8);
-    int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);
-    int8 right  = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);
-
-    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
- *
- * @param[in] left_pixel    Pointer to the left pixel.
- * @param[in] left_coeff    Weight of the left pixel
- * @param[in] middle_coeff  Weight of the middle pixel
- * @param[in] right_coeff   Weight of the right pixel
- * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
- * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
- *
- * @return a int8 containing 8 convoluted values.
- */
-inline int8 convolution1x3_stride_2(__global const uchar *left_pixel,
-                                    const int             left_coeff,
-                                    const int             middle_coeff,
-                                    const int             right_coeff,
-                                    const int             input_offset,
-                                    const int             weight_offset)
-{
-    int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
-    int   temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int);
-
-    int8 left   = CONVERT(temp0.s02468ace, int8);
-    int8 middle = CONVERT(temp0.s13579bdf, int8);
-    int8 right  = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);
-
-    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
-}
-
-/** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
- *
- * @param[in] left_pixel    Pointer to the left pixel.
- * @param[in] left_coeff    Weight of the left pixel
- * @param[in] middle_coeff  Weight of the middle pixel
- * @param[in] right_coeff   Weight of the right pixel
- * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
- * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
- *
- * @return a int8 containing 8 convoluted values.
- */
-inline int8 convolution1x3_stride_3(__global const uchar *left_pixel,
-                                    const int             left_coeff,
-                                    const int             middle_coeff,
-                                    const int             right_coeff,
-                                    const int             input_offset,
-                                    const int             weight_offset)
-{
-    int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
-    int8  temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8);
-
-    int8 left   = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);
-    int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);
-    int8 right  = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);
-
-    return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
-}
-
-/** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
- *
- * Convolution matrix layout:
- *
- * [ mat0, mat1, mat2 ]\n
- * [ mat3, mat4, mat5 ]\n
- * [ mat6, mat7, mat8 ]\n
- *
- * @param[in] src               A pointer to source Image structure
- * @param[in] mat0              Coefficient from the convolution matrix
- * @param[in] mat1              Coefficient from the convolution matrix
- * @param[in] mat2              Coefficient from the convolution matrix
- * @param[in] mat3              Coefficient from the convolution matrix
- * @param[in] mat4              Coefficient from the convolution matrix
- * @param[in] mat5              Coefficient from the convolution matrix
- * @param[in] mat6              Coefficient from the convolution matrix
- * @param[in] mat7              Coefficient from the convolution matrix
- * @param[in] mat8              Coefficient from the convolution matrix
- * @param[in] input_offset      Quantized offset of zero point of the input tensor data range
- * @param[in] weight_offset     Quantized offset of zero point of the weights tensor data range
- * @param[in] output_offset     Quantized offset of zero point of the output tensor data range
- * @param[in] output_multiplier Output scale multiplier
- * @param[in] output_shift      Output scale divisor exponent
- * @param[in] bias              (Optional) Bias value
- *
- * @return a uchar8 containing 8 convoluted values.
- */
-inline uchar8 convolution3x3(
-    Image      *src,
-    const uchar mat0, const uchar mat1, const uchar mat2,
-    const uchar mat3, const uchar mat4, const uchar mat5,
-    const uchar mat6, const uchar mat7, const uchar mat8,
-    const int input_offset, const int weight_offset, const int output_offset,
-    const int output_multiplier, const int output_shift
-#if defined(HAS_BIAS)
-    ,
-    const int bias
-#endif //defined(HAS_BIAS)
-)
-{
-    int8 pixels;
-
-    pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
-    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
-    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);
-#if defined(HAS_BIAS)
-    pixels += (int8)(bias);
-#endif //defined(HAS_BIAS)
-
-    pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8);
-    pixels = pixels + output_offset;
-    pixels = clamp(pixels, 0, 255);
-
-    return CONVERT(pixels, uchar8);
-}
-
-/** This function computes the horizontal integral of the image.
+/** This function computes the 3x3 depthwise convolution for QASYMM8 tensors; the quantization offsets, output multiplier and output shift are passed at compile time.
  *
  * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
@@ -206,54 +92,147 @@
  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
- * @param[in] input_offset                          Quantized offset of zero point of the input tensor data range
- * @param[in] weight_offset                         Quantized offset of zero point of the weights tensor data range
- * @param[in] output_offset                         Quantized offset of zero point of the output tensor data range
- * @param[in] output_multiplier                     Output scale multiplier
- * @param[in] output_shift                          Output scale divisor exponent
  */
 
 __kernel void depthwise_convolution_3x3_quantized(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(weights),
+    TENSOR3D_DECLARATION(weights)
 #if defined(HAS_BIAS)
-    VECTOR_DECLARATION(biases),
+    ,
+    VECTOR_DECLARATION(biases)
 #endif //defined(HAS_BIAS)
-    int input_offset,
-    int weight_offset,
-    int output_offset,
-    int output_multiplier,
-    int output_shift)
+)
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif //defined(HAS_BIAS)
 
-    uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
-    uchar3 weights_values0 = vload3(0, weights.ptr + offset.s0);
-    uchar3 weights_values1 = vload3(0, weights.ptr + offset.s1);
-    uchar3 weights_values2 = vload3(0, weights.ptr + offset.s2);
-
-#if defined(HAS_BIAS)
     int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
 #endif //defined(HAS_BIAS)
 
-    uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
-                                   weights_values1.s0, weights_values1.s1, weights_values1.s2,
-                                   weights_values2.s0, weights_values2.s1, weights_values2.s2,
-                                   input_offset, weight_offset, output_offset,
-                                   output_multiplier, output_shift
-#if defined(HAS_BIAS)
-                                   ,
-                                   bias_value
-#endif //defined(HAS_BIAS)
-                                  );
+    uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y); // three weights at Y = 0
+    uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y); // three weights at Y = 1
+    uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y); // three weights at Y = 2
 
-    vstore8(pixels, 0, dst.ptr);
+    int8 values0 = 0; // input * weight products for the 8 outputs stored at dst.ptr
+    int8 sum0    = 0; // sum of the input values under that window, scaled by WEIGHTS_OFFSET further down
+#if CONV_STRIDE_Y == 1
+    int8 values1 = 0; // same as values0, for the 8 outputs stored at dst.ptr + dst_stride_y
+    int8 sum1    = 0; // same as sum0, for the second output row
+#endif /* CONV_STRIDE_Y */
+
+    // Row0
+    int8 left, middle, right; // assigned by each GET_VALUES expansion
+    GET_VALUES(src.ptr + 0 * src_stride_y, left, middle, right);
+    values0 += left * (int8)(w0.s0);
+    values0 += middle * (int8)(w0.s1);
+    values0 += right * (int8)(w0.s2);
+
+#if WEIGHTS_OFFSET != 0
+    sum0 += left + middle + right;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+    // Row1
+    GET_VALUES(src.ptr + 1 * src_stride_y, left, middle, right);
+    values0 += left * (int8)(w1.s0);
+    values0 += middle * (int8)(w1.s1);
+    values0 += right * (int8)(w1.s2);
+#if CONV_STRIDE_Y == 1
+    values1 += left * (int8)(w0.s0); // input row 1 is paired with weights w0 for the second output row
+    values1 += middle * (int8)(w0.s1);
+    values1 += right * (int8)(w0.s2);
+#endif /* CONV_STRIDE_Y == 1 */
+
+#if WEIGHTS_OFFSET != 0
+    int8 tmp = left + middle + right; // computed once, added to both running sums
+    sum0 += tmp;
+#if CONV_STRIDE_Y == 1
+    sum1 += tmp;
+#endif /* CONV_STRIDE_Y == 1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+    // Row2
+    GET_VALUES(src.ptr + 2 * src_stride_y, left, middle, right);
+    values0 += left * (int8)(w2.s0);
+    values0 += middle * (int8)(w2.s1);
+    values0 += right * (int8)(w2.s2);
+#if CONV_STRIDE_Y == 1
+    values1 += left * (int8)(w1.s0);
+    values1 += middle * (int8)(w1.s1);
+    values1 += right * (int8)(w1.s2);
+#endif /* CONV_STRIDE_Y == 1 */
+
+#if WEIGHTS_OFFSET != 0
+    tmp = left + middle + right;
+    sum0 += tmp;
+#if CONV_STRIDE_Y == 1
+    sum1 += tmp;
+#endif /* CONV_STRIDE_Y == 1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if CONV_STRIDE_Y == 1
+    // Row3
+    GET_VALUES(src.ptr + 3 * src_stride_y, left, middle, right); // read only for the second output row
+    values1 += left * (int8)(w2.s0);
+    values1 += middle * (int8)(w2.s1);
+    values1 += right * (int8)(w2.s2);
+
+#if WEIGHTS_OFFSET != 0
+    sum1 += left + middle + right;
+#endif /* WEIGHTS_OFFSET != 0 */
+#endif /* CONV_STRIDE_Y == 1 */
+
+#if defined(HAS_BIAS)
+    values0 += (int8)(bias_value);
+#if CONV_STRIDE_Y == 1
+    values1 += (int8)(bias_value);
+#endif /* CONV_STRIDE_Y == 1 */
+#endif //defined(HAS_BIAS)
+
+#if WEIGHTS_OFFSET != 0
+    values0 += sum0 * (int8)(WEIGHTS_OFFSET); // term: (sum of inputs) * WEIGHTS_OFFSET
+#if CONV_STRIDE_Y == 1
+    values1 += sum1 * (int8)(WEIGHTS_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 */
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    ushort  sum_weights = 0; // sum of the nine weights; at most 9 * 255, which fits in a ushort
+    ushort3 tmp_we      = convert_ushort3(w0) + convert_ushort3(w1) + convert_ushort3(w2);
+    sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
+    values0 += sum_weights * (int8)(INPUT_OFFSET); // term: (sum of weights) * INPUT_OFFSET
+#if CONV_STRIDE_Y == 1
+    values1 += sum_weights * (int8)(INPUT_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 */
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    values0 += (int8)(K_OFFSET); // constant term supplied at compile time
+#if CONV_STRIDE_Y == 1
+    values1 += (int8)(K_OFFSET);
+#endif /* CONV_STRIDE_Y == 1 */
+#endif /* K_OFFSET != 0 */
+
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+    values0 += (int8)OUTPUT_OFFSET;
+    uchar8 res0 = convert_uchar8_sat(values0); // saturates to [0, 255]
+    res0        = max(res0, (uchar8)0);   // NOTE(review): cannot change an unsigned 8-bit value
+    res0        = min(res0, (uchar8)255); // NOTE(review): cannot change an unsigned 8-bit value
+
+    vstore8(res0, 0, dst.ptr);
+#if CONV_STRIDE_Y == 1
+
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+    values1 += (int8)OUTPUT_OFFSET;
+    uchar8 res1 = convert_uchar8_sat(values1); // saturates to [0, 255]
+    res1        = max(res1, (uchar8)0);   // NOTE(review): cannot change an unsigned 8-bit value
+    res1        = min(res1, (uchar8)255); // NOTE(review): cannot change an unsigned 8-bit value
+
+    vstore8(res1, 0, dst.ptr + dst_stride_y); // second output row, one dst_stride_y below the first
+#endif /* CONV_STRIDE_Y == 1 */
 }
 
-#endif //defined(CONV_STRIDE_X)
+#endif /* defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */

diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
index d0cf032..b58dc7a 100644
--- a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -247,3 +247,62 @@
     vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
 }
 #endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+/** This function computes the output stage of a depthwise convolution: it adds the optional bias, rescales the 32-bit values and stores them saturated to 8 bits.
+ *
+ * @param[in] src_ptr                            Pointer to the source image. Supported data types: S32
+ * @param[in] src_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in] src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                            Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in] bias_ptr                           (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] bias_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ * @param[in] output_offset                      Quantized offset of zero point of the output tensor data range
+ * @param[in] output_multiplier                  Output scale multiplier
+ * @param[in] output_shift                       Output scale divisor exponent
+ */
+
+__kernel void output_stage_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif //defined(HAS_BIAS)
+    int output_offset,
+    int output_multiplier,
+    int output_shift)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(HAS_BIAS)
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); // NO_STEP: the element is selected with vector_offset() below
+#endif //defined(HAS_BIAS)
+
+    // Load input
+    int16 vals = vload16(0, (__global int *)(src.ptr)); // 16 consecutive 32-bit values per work-item
+
+#if defined(HAS_BIAS)
+    // Load and add bias
+    int bias_value = *((__global int *)(vector_offset(&bias, get_global_id(2)))); // one bias element per Z work-item id
+    vals += (int16)(bias_value);
+#endif //defined(HAS_BIAS)
+
+    vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, 16);
+    vals = vals + output_offset; // scalar offset is applied to all 16 lanes
+
+    // Store result in dst
+    vstore16(convert_uchar16_sat(vals), 0, (__global uchar *)dst.ptr); // saturating conversion clamps to [0, 255]
+}

diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
index 3e1929c..76b35b9 100644
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ b/src/core/CL/cl_kernels/fast_corners.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -254,6 +254,9 @@
             out[id].x               = get_global_id(0) + offset;
             out[id].y               = get_global_id(1) + offset;
             out[id].tracking_status = 1;
+            out[id].scale           = 0.f;
+            out[id].orientation     = 0.f;
+            out[id].error           = 0.f;
         }
     }
 }

diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index d55346b..46fa645 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -476,19 +476,19 @@
 #define floatx16 float16
 #define float16_TYPE float16
 
-#define CONVERTQ_DOWN_IMPL(in_type, out_type)                                                                                      \
-    inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position)                                            \
-    {                                                                                                                              \
-        return CONVERT(a * (1 << fixed_point_position) + select((in_type)-0.5, (in_type)0.5, isgreater(a, (in_type)0)), out_type); \
+#define CONVERTQ_DOWN_IMPL(in_type, out_type)                                                                                        \
+    inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position)                                              \
+    {                                                                                                                                \
+        return CONVERT(a * (1 << fixed_point_position) + select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), out_type); \
     }
 
 CONVERTQ_DOWN_IMPL(float16, qs8x16)
 CONVERTQ_DOWN_IMPL(float16, qs16x16)
 
-#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type)                                                                                      \
-    inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position)                                          \
-    {                                                                                                                                  \
-        return CONVERT_SAT(a * (1 << fixed_point_position) + select((in_type)-0.5, (in_type)0.5, isgreater(a, (in_type)0)), out_type); \
+#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type)                                                                                        \
+    inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position)                                            \
+    {                                                                                                                                    \
+        return CONVERT_SAT(a * (1 << fixed_point_position) + select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), out_type); \
     }
 
 CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16)

diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index c763cb3..58a550f 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,9 +27,24 @@
 #include "fixed_point.h"
 #endif // FIXED_POINT_POSITION
 
-/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
+#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
+
+#if ELEMENT_SIZE == 1
+#define DATA_TYPE uchar
+#elif ELEMENT_SIZE == 2
+#define DATA_TYPE ushort
+#elif ELEMENT_SIZE == 4
+#define DATA_TYPE uint
+#else // ELEMENT_SIZE == 1
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U32/S32/F32
+ * @note The transposition width must be passed at compile time using -DTRANSPOSE_W (i.e. -DTRANSPOSE_W=4)
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
@@ -37,12 +52,12 @@
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  */
-__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
+__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
                                 IMAGE_DECLARATION(dst))
 {
     uint x = get_global_id(0);
@@ -52,16 +67,25 @@
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 
     // Compute address for Matrix B transposed - destination. X and Y are swapped
-    uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
+                             (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
 
-    uint4 b0 = vload4(0, (__global uint *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
+    b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
 
-    vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
+    VSTORE(TRANSPOSE_W)
+    (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
 }
+#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
 
-/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
+#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
@@ -74,41 +98,10 @@
  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  */
-__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
-                                IMAGE_DECLARATION(dst))
-{
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-
-    // Compute address for Matrix B - source
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
-    // Compute address for Matrix B transposed - destination. X and Y are swapped
-    uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
-
-    ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
-
-    vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8
- * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
+__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
                                  IMAGE_DECLARATION(dst))
 {
+    // Compute source and destination addresses
     uint x = get_global_id(0);
     uint y = get_global_id(1);
 
@@ -116,145 +109,41 @@
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 
     // Compute address for Matrix B transposed - destination. X and Y are swapped
-    uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
-
-    uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
-
-    vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
-                                       IMAGE_DECLARATION(dst))
-{
-    // Compute source and destination addresses
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
+                             (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
 
     // Load values from Matrix A
-    uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
-    uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
-    uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
-    uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
 
-    uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
-    vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
+    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
 
-    val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
-    vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
+    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
+    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
 
-    val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
-    vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
+    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
+    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
 
-    val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
-    vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
+    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
+    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
 }
+#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
 
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
- * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
-                                       IMAGE_DECLARATION(dst))
-{
-    // Compute source and destination addresses
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values from Matrix A
-    ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
-    ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
-    ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
-    ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
-
-    ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
-    vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
-
-    val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
-    vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
-
-    val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
-    vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
-
-    val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
-    vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8
- * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
-                                      IMAGE_DECLARATION(dst))
-{
-    // Compute source and destination addresses
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    // Load values from Matrix A
-    uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
-    uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
-    uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
-    uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
-
-    uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
-                             (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
-    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
-
-    val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
-                     (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
-    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
-
-    val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
-                     (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
-    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
-
-    val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
-                     (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
-    vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
-}
-
-#if defined(COLS_B)
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
 /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -270,30 +159,32 @@
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
 __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
                                                          IMAGE_DECLARATION(dst))
 {
-    // src_addr.s0 = address of matrix A
-    // src_addr.s1 = address of matrix B
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
 
-    // Compute address for matrix A and B
-    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
-                                                                        (src1_stride_y));
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
 
-    // Add offset_first_element_in_bytes
-    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
-    // Divide by 4 in order to get the src_addr in unit of float
-    src_addr = src_addr >> 2;
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
     // Compute end row address for matrix B
-    int end_row_mtx_b = src_addr.s1 + COLS_B;
+    __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
 
     // Reset accumulators
     float4 c00 = 0.0f;
@@ -301,11 +192,11 @@
     float4 c20 = 0.0f;
     float4 c30 = 0.0f;
 
-    for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+    for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
-        float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
 
         c00 += (float4)a0.s0 * b0;
         c10 += (float4)a0.s1 * b0;
@@ -313,8 +204,8 @@
         c30 += (float4)a0.s3 * b0;
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
-        b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
 
         c00 += (float4)a0.s0 * b0;
         c10 += (float4)a0.s1 * b0;
@@ -322,11 +213,11 @@
         c30 += (float4)a0.s3 * b0;
     }
 
-    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
-        float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
 
         c00 += (float4)a0.s0 * b0;
         c10 += (float4)a0.s1 * b0;
@@ -355,7 +246,9 @@
 /** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -371,23 +264,33 @@
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
 __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
                                                          IMAGE_DECLARATION(dst))
 {
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+    __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
     // Compute end row address for matrix B
     __global float *src_end_addr_b = src_addr_b + COLS_B;
 
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
     // Reset accumulators
     float c00 = 0.0f;
     float c01 = 0.0f;
@@ -406,7 +309,7 @@
     float c32 = 0.0f;
     float c33 = 0.0f;
 
-    for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
         float4 a0 = vload4(0, src_addr_a);
@@ -433,8 +336,8 @@
         c33 = fma(a0.s3, b0.s3, c33);
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, src_addr_a + 4);
-        b0 = vload4(0, src_addr_b + 4);
+        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
 
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
@@ -457,8 +360,8 @@
         c33 = fma(a0.s3, b0.s3, c33);
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, src_addr_a + 8);
-        b0 = vload4(0, src_addr_b + 8);
+        a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
 
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
@@ -481,8 +384,8 @@
         c33 = fma(a0.s3, b0.s3, c33);
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, src_addr_a + 12);
-        b0 = vload4(0, src_addr_b + 12);
+        a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);
 
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
@@ -505,7 +408,7 @@
         c33 = fma(a0.s3, b0.s3, c33);
     }
 
-    for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
         float4 a0 = vload4(0, src_addr_a);
@@ -555,8 +458,6 @@
     c33 = c33 * ALPHA;
 #endif // defined(ALPHA)
 
-    barrier(CLK_GLOBAL_MEM_FENCE);
-
     // Store 4x4 block
     vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
     vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
@@ -568,7 +469,9 @@
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
- * @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -584,30 +487,32 @@
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
 __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
                                                  IMAGE_DECLARATION(dst))
 {
-    // src_addr.s0 = address of matrix A
-    // src_addr.s1 = address of matrix B
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
 
-    // Compute address for matrix A and B
-    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
-                                                                        (src1_stride_y));
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
 
-    // Add offset_first_element_in_bytes
-    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
-    // Divide by 2 in order to get the src_addr in unit of half
-    src_addr = src_addr >> 1;
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
     // Compute end row address for matrix B
-    int end_row_mtx_b = src_addr.s1 + COLS_B;
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
 
     // Reset accumulators
     half8 c00 = 0.0f;
@@ -615,11 +520,11 @@
     half8 c20 = 0.0f;
     half8 c30 = 0.0f;
 
-    for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
-        half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
 
         c00 += (half8)a0.s0 * b0;
         c10 += (half8)a0.s1 * b0;
@@ -627,8 +532,8 @@
         c30 += (half8)a0.s3 * b0;
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
-        b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+        b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
 
         c00 += (half8)a0.s0 * b0;
         c10 += (half8)a0.s1 * b0;
@@ -636,11 +541,11 @@
         c30 += (half8)a0.s3 * b0;
     }
 
-    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
-        half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
 
         c00 += (half8)a0.s0 * b0;
         c10 += (half8)a0.s1 * b0;
@@ -671,7 +576,9 @@
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
  *
- * @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
+ * @note The number of columns of matrix B, the optional alpha's value and the fixed point position need to be passed at compile time using -DCOLS_B, -DALPHA and -DFIXED_POINT_POSITION
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
  * @note: ALPHA must be passed in 8 bit fixed point format
  *
@@ -689,27 +596,32 @@
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
 __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
                                                  IMAGE_DECLARATION(dst))
 {
-    // src_addr.s0 = address of matrix A
-    // src_addr.s1 = address of matrix B
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
 
-    // Compute address for matrix A and B
-    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
-                                                                        (src1_stride_y));
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;
 
-    // Add offset_first_element_in_bytes
-    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes;
 
     // Compute end row address for matrix B
-    int end_row_mtx_b = src_addr.s1 + COLS_B;
+    __global char *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
 
     // Reset accumulators
     short8 c00 = 0.0f;
@@ -722,11 +634,11 @@
     short8 c31 = 0.0f;
 
     // This for loop performs 1 accumulation for each iteration
-    for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        char4  a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
-        char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
+        char4  a0 = vload4(0, src_addr_a);
+        char16 b0 = vload16(0, src_addr_b);
 
         c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
         c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
@@ -765,7 +677,9 @@
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
- * @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
+ * @note The number of columns of matrix B, the optional alpha's value and the fixed point position need to be passed at compile time using -DCOLS_B, -DALPHA and -DFIXED_POINT_POSITION
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
  * @note: ALPHA must be passed in 16 bit fixed point format
  *
@@ -783,30 +697,32 @@
  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
 __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
                                                   IMAGE_DECLARATION(src1),
                                                   IMAGE_DECLARATION(dst))
 {
-    // src_addr.s0 = address of matrix A
-    // src_addr.s1 = address of matrix B
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
 
-    // Compute address for matrix A and B
-    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
-                                                                        (src1_stride_y));
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
 
-    // Add offset_first_element_in_bytes
-    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
-    // Divide by 2 in order to get the src_addr in unit of short
-    src_addr = src_addr >> 1;
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
     // Compute end row address for matrix B
-    int end_row_mtx_b = src_addr.s1 + COLS_B;
+    __global short *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
 
     // Reset accumulators
     int8 c00 = 0.0f;
@@ -815,11 +731,11 @@
     int8 c30 = 0.0f;
 
     // This for loop performs 1 accumulation for each iteration
-    for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
     {
         /* Load values from matrix A (interleaved) and matrix B (transposed) */
-        short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
-        short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
+        short4 a0 = vload4(0, src_addr_a);
+        short8 b0 = vload8(0, src_addr_b);
 
         c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
         c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
@@ -850,7 +766,7 @@
     vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
 }
 #endif // defined(FIXED_POINT_POSITION)
-#endif // defined(COLS_B)
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
 
 #if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
 #if defined(DATA_TYPE)
@@ -1741,7 +1657,7 @@
 #if defined(BETA)
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
  *
- * @attention The beta's value need to be passed at compile time using -DBETA
+ * @note The beta's value needs to be passed at compile time using -DBETA
  *
  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1778,7 +1694,7 @@
 
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
  *
- * @attention The beta's value need to be passed at compile time using -DBETA
+ * @note The beta's value needs to be passed at compile time using -DBETA
  *
  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1816,7 +1732,7 @@
 #if defined(FIXED_POINT_POSITION)
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
  *
- * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
+ * @note The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
  *
  * @note: BETA must be passed in 8 bit fixed point format
  *
@@ -1855,7 +1771,7 @@
 
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
  *
- * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
+ * @note The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
  *
  * @note: BETA must be passed in 16 bit fixed point format
  *
@@ -1897,9 +1813,9 @@
 #if defined(WIDTH_VECTOR_A)
 /** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
  *
- * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
+ * @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
  *
- * @attention The input A and matrix B must not be reshaped
+ * @note The input A and matrix B must not be reshaped
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)

diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index a928813..5e144d7 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,13 @@
 #include "helpers.h"
 #include "helpers_asymm.h"
 
-#if defined(COLS_B)
+#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
+ *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
  *
- * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
+ * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
+ * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=8 when mult_transpose1xW_width is 2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -49,69 +51,370 @@
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
-__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0),
-                                                 IMAGE_DECLARATION(src1),
-                                                 IMAGE_DECLARATION(dst))
+__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
+                                                         IMAGE_DECLARATION(src1),
+                                                         IMAGE_DECLARATION(dst))
 {
-    // src_addr.s0 = address of matrix A
-    // src_addr.s1 = address of matrix B
-    // Compute address for matrix A and B
-    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
-                                                                        (src1_stride_y));
+    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
 
-    // Add offset_first_element_in_bytes
-    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
     // Compute end row address for matrix B
-    int end_row_mtx_b = src_addr.s1 + COLS_B;
+    __global uchar *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
 
     // Reset accumulators
-    int16 c00 = 0;
-    int16 c10 = 0;
-    int16 c20 = 0;
-    int16 c30 = 0;
+    int4 c00 = 0;
+    int4 c10 = 0;
+    int4 c20 = 0;
+    int4 c30 = 0;
 
-    for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
+    for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        int8 a0  = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
-        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+        int4 a0 = convert_int4(vload4(0, src_addr_a));
+        int4 b0 = convert_int4(vload4(0, src_addr_b));
 
-        c00 += (int16)a0.s0 * b0;
-        c10 += (int16)a0.s1 * b0;
-        c20 += (int16)a0.s2 * b0;
-        c30 += (int16)a0.s3 * b0;
+        c00 += (int4)a0.s0 * b0;
+        c10 += (int4)a0.s1 * b0;
+        c20 += (int4)a0.s2 * b0;
+        c30 += (int4)a0.s3 * b0;
 
-        int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
+        a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+        b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));
 
-        c00 += (int16)a0.s4 * b1;
-        c10 += (int16)a0.s5 * b1;
-        c20 += (int16)a0.s6 * b1;
-        c30 += (int16)a0.s7 * b1;
+        c00 += (int4)a0.s0 * b0;
+        c10 += (int4)a0.s1 * b0;
+        c20 += (int4)a0.s2 * b0;
+        c30 += (int4)a0.s3 * b0;
     }
 
-    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
+    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        int4 a0  = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
-        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+        int4 a0 = convert_int4(vload4(0, src_addr_a));
+        int4 b0 = convert_int4(vload4(0, src_addr_b));
 
-        c00 += (int16)a0.s0 * b0;
-        c10 += (int16)a0.s1 * b0;
-        c20 += (int16)a0.s2 * b0;
-        c30 += (int16)a0.s3 * b0;
+        c00 += (int4)a0.s0 * b0;
+        c10 += (int4)a0.s1 * b0;
+        c20 += (int4)a0.s2 * b0;
+        c30 += (int4)a0.s3 * b0;
     }
 
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    // Store 4x16 block
-    vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0)));
-    vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1)));
-    vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2)));
-    vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3)));
+    // Store 4x4 block
+    vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
+    vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
+    vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
+    vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
 }
-#endif // defined(COLS_B)
+
+/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
+ *
+ * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
+ * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=8 when mult_transpose1xW_width is 2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
+                                                         IMAGE_DECLARATION(src1),
+                                                         IMAGE_DECLARATION(dst))
+{
+    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+
+    // Compute end row address for matrix B
+    __global uchar *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    uint c00 = 0;
+    uint c01 = 0;
+    uint c02 = 0;
+    uint c03 = 0;
+    uint c10 = 0;
+    uint c11 = 0;
+    uint c12 = 0;
+    uint c13 = 0;
+    uint c20 = 0;
+    uint c21 = 0;
+    uint c22 = 0;
+    uint c23 = 0;
+    uint c30 = 0;
+    uint c31 = 0;
+    uint c32 = 0;
+    uint c33 = 0;
+
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
+    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        uchar16 a0 = vload16(0, src_addr_a);
+        uchar4  b0 = vload4(0, src_addr_b);
+
+        c00 += (ushort)a0.s0 * b0.s0;
+        c01 += (ushort)a0.s0 * b0.s1;
+        c02 += (ushort)a0.s0 * b0.s2;
+        c03 += (ushort)a0.s0 * b0.s3;
+
+        c10 += (ushort)a0.s1 * b0.s0;
+        c11 += (ushort)a0.s1 * b0.s1;
+        c12 += (ushort)a0.s1 * b0.s2;
+        c13 += (ushort)a0.s1 * b0.s3;
+
+        c20 += (ushort)a0.s2 * b0.s0;
+        c21 += (ushort)a0.s2 * b0.s1;
+        c22 += (ushort)a0.s2 * b0.s2;
+        c23 += (ushort)a0.s2 * b0.s3;
+
+        c30 += (ushort)a0.s3 * b0.s0;
+        c31 += (ushort)a0.s3 * b0.s1;
+        c32 += (ushort)a0.s3 * b0.s2;
+        c33 += (ushort)a0.s3 * b0.s3;
+
+        // Load values from matrix B (transposed)
+        b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.s4 * b0.s0;
+        c01 += (ushort)a0.s4 * b0.s1;
+        c02 += (ushort)a0.s4 * b0.s2;
+        c03 += (ushort)a0.s4 * b0.s3;
+
+        c10 += (ushort)a0.s5 * b0.s0;
+        c11 += (ushort)a0.s5 * b0.s1;
+        c12 += (ushort)a0.s5 * b0.s2;
+        c13 += (ushort)a0.s5 * b0.s3;
+
+        c20 += (ushort)a0.s6 * b0.s0;
+        c21 += (ushort)a0.s6 * b0.s1;
+        c22 += (ushort)a0.s6 * b0.s2;
+        c23 += (ushort)a0.s6 * b0.s3;
+
+        c30 += (ushort)a0.s7 * b0.s0;
+        c31 += (ushort)a0.s7 * b0.s1;
+        c32 += (ushort)a0.s7 * b0.s2;
+        c33 += (ushort)a0.s7 * b0.s3;
+
+        // Load values from matrix B (transposed)
+        b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.s8 * b0.s0;
+        c01 += (ushort)a0.s8 * b0.s1;
+        c02 += (ushort)a0.s8 * b0.s2;
+        c03 += (ushort)a0.s8 * b0.s3;
+
+        c10 += (ushort)a0.s9 * b0.s0;
+        c11 += (ushort)a0.s9 * b0.s1;
+        c12 += (ushort)a0.s9 * b0.s2;
+        c13 += (ushort)a0.s9 * b0.s3;
+
+        c20 += (ushort)a0.sA * b0.s0;
+        c21 += (ushort)a0.sA * b0.s1;
+        c22 += (ushort)a0.sA * b0.s2;
+        c23 += (ushort)a0.sA * b0.s3;
+
+        c30 += (ushort)a0.sB * b0.s0;
+        c31 += (ushort)a0.sB * b0.s1;
+        c32 += (ushort)a0.sB * b0.s2;
+        c33 += (ushort)a0.sB * b0.s3;
+
+        // Load values from matrix B (transposed)
+        b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.sC * b0.s0;
+        c01 += (ushort)a0.sC * b0.s1;
+        c02 += (ushort)a0.sC * b0.s2;
+        c03 += (ushort)a0.sC * b0.s3;
+
+        c10 += (ushort)a0.sD * b0.s0;
+        c11 += (ushort)a0.sD * b0.s1;
+        c12 += (ushort)a0.sD * b0.s2;
+        c13 += (ushort)a0.sD * b0.s3;
+
+        c20 += (ushort)a0.sE * b0.s0;
+        c21 += (ushort)a0.sE * b0.s1;
+        c22 += (ushort)a0.sE * b0.s2;
+        c23 += (ushort)a0.sE * b0.s3;
+
+        c30 += (ushort)a0.sF * b0.s0;
+        c31 += (ushort)a0.sF * b0.s1;
+        c32 += (ushort)a0.sF * b0.s2;
+        c33 += (ushort)a0.sF * b0.s3;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload16(0, src_addr_a + 16);
+        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.s0 * b0.s0;
+        c01 += (ushort)a0.s0 * b0.s1;
+        c02 += (ushort)a0.s0 * b0.s2;
+        c03 += (ushort)a0.s0 * b0.s3;
+
+        c10 += (ushort)a0.s1 * b0.s0;
+        c11 += (ushort)a0.s1 * b0.s1;
+        c12 += (ushort)a0.s1 * b0.s2;
+        c13 += (ushort)a0.s1 * b0.s3;
+
+        c20 += (ushort)a0.s2 * b0.s0;
+        c21 += (ushort)a0.s2 * b0.s1;
+        c22 += (ushort)a0.s2 * b0.s2;
+        c23 += (ushort)a0.s2 * b0.s3;
+
+        c30 += (ushort)a0.s3 * b0.s0;
+        c31 += (ushort)a0.s3 * b0.s1;
+        c32 += (ushort)a0.s3 * b0.s2;
+        c33 += (ushort)a0.s3 * b0.s3;
+
+        // Load values from matrix B (transposed)
+        b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.s4 * b0.s0;
+        c01 += (ushort)a0.s4 * b0.s1;
+        c02 += (ushort)a0.s4 * b0.s2;
+        c03 += (ushort)a0.s4 * b0.s3;
+
+        c10 += (ushort)a0.s5 * b0.s0;
+        c11 += (ushort)a0.s5 * b0.s1;
+        c12 += (ushort)a0.s5 * b0.s2;
+        c13 += (ushort)a0.s5 * b0.s3;
+
+        c20 += (ushort)a0.s6 * b0.s0;
+        c21 += (ushort)a0.s6 * b0.s1;
+        c22 += (ushort)a0.s6 * b0.s2;
+        c23 += (ushort)a0.s6 * b0.s3;
+
+        c30 += (ushort)a0.s7 * b0.s0;
+        c31 += (ushort)a0.s7 * b0.s1;
+        c32 += (ushort)a0.s7 * b0.s2;
+        c33 += (ushort)a0.s7 * b0.s3;
+
+        // Load values from matrix B (transposed)
+        b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.s8 * b0.s0;
+        c01 += (ushort)a0.s8 * b0.s1;
+        c02 += (ushort)a0.s8 * b0.s2;
+        c03 += (ushort)a0.s8 * b0.s3;
+
+        c10 += (ushort)a0.s9 * b0.s0;
+        c11 += (ushort)a0.s9 * b0.s1;
+        c12 += (ushort)a0.s9 * b0.s2;
+        c13 += (ushort)a0.s9 * b0.s3;
+
+        c20 += (ushort)a0.sA * b0.s0;
+        c21 += (ushort)a0.sA * b0.s1;
+        c22 += (ushort)a0.sA * b0.s2;
+        c23 += (ushort)a0.sA * b0.s3;
+
+        c30 += (ushort)a0.sB * b0.s0;
+        c31 += (ushort)a0.sB * b0.s1;
+        c32 += (ushort)a0.sB * b0.s2;
+        c33 += (ushort)a0.sB * b0.s3;
+
+        // Load values from matrix B (transposed)
+        b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
+
+        c00 += (ushort)a0.sC * b0.s0;
+        c01 += (ushort)a0.sC * b0.s1;
+        c02 += (ushort)a0.sC * b0.s2;
+        c03 += (ushort)a0.sC * b0.s3;
+
+        c10 += (ushort)a0.sD * b0.s0;
+        c11 += (ushort)a0.sD * b0.s1;
+        c12 += (ushort)a0.sD * b0.s2;
+        c13 += (ushort)a0.sD * b0.s3;
+
+        c20 += (ushort)a0.sE * b0.s0;
+        c21 += (ushort)a0.sE * b0.s1;
+        c22 += (ushort)a0.sE * b0.s2;
+        c23 += (ushort)a0.sE * b0.s3;
+
+        c30 += (ushort)a0.sF * b0.s0;
+        c31 += (ushort)a0.sF * b0.s1;
+        c32 += (ushort)a0.sF * b0.s2;
+        c33 += (ushort)a0.sF * b0.s3;
+    }
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        uchar4 a0 = vload4(0, src_addr_a);
+        uchar4 b0 = vload4(0, src_addr_b);
+
+        c00 += (ushort)a0.s0 * b0.s0;
+        c01 += (ushort)a0.s0 * b0.s1;
+        c02 += (ushort)a0.s0 * b0.s2;
+        c03 += (ushort)a0.s0 * b0.s3;
+
+        c10 += (ushort)a0.s1 * b0.s0;
+        c11 += (ushort)a0.s1 * b0.s1;
+        c12 += (ushort)a0.s1 * b0.s2;
+        c13 += (ushort)a0.s1 * b0.s3;
+
+        c20 += (ushort)a0.s2 * b0.s0;
+        c21 += (ushort)a0.s2 * b0.s1;
+        c22 += (ushort)a0.s2 * b0.s2;
+        c23 += (ushort)a0.s2 * b0.s3;
+
+        c30 += (ushort)a0.s3 * b0.s0;
+        c31 += (ushort)a0.s3 * b0.s1;
+        c32 += (ushort)a0.s3 * b0.s2;
+        c33 += (ushort)a0.s3 * b0.s3;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store 4x4 block
+    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
+    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
+    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
+    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
+}
+#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
 
 #if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
 #define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
@@ -140,9 +443,9 @@
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
-__kernel void gemmlowp_mm(IMAGE_DECLARATION(src0),
-                          IMAGE_DECLARATION(src1),
-                          IMAGE_DECLARATION(dst))
+__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst))
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -167,6 +470,9 @@
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     VECTOR_UINT acc3 = 0;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    VECTOR_UINT acc4 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
 
     for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
     {
@@ -181,6 +487,9 @@
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
         // Load values from matrix B
         VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
         VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
@@ -200,6 +509,10 @@
         acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
         acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
+        acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
     }
 
     for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
@@ -215,6 +528,9 @@
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
         // Load values from matrix B
         VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
 
@@ -229,6 +545,9 @@
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
     }
 
     // Compute destination address
@@ -249,6 +568,355 @@
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
     (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+}
+
+/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
+                                  IMAGE_DECLARATION(src1),
+                                  IMAGE_DECLARATION(dst))
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx;
+
+    int end_row_vec_a = src_addr.s0 + COLS_A;
+
+    uint acc00 = 0;
+    uint acc01 = 0;
+    uint acc02 = 0;
+    uint acc03 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    uint acc10 = 0;
+    uint acc11 = 0;
+    uint acc12 = 0;
+    uint acc13 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    uint acc20 = 0;
+    uint acc21 = 0;
+    uint acc22 = 0;
+    uint acc23 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    uint acc30 = 0;
+    uint acc31 = 0;
+    uint acc32 = 0;
+    uint acc33 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    uint acc40 = 0;
+    uint acc41 = 0;
+    uint acc42 = 0;
+    uint acc43 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
+    {
+        // Load values from matrix A
+        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        // Load values from matrix B
+        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
+        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
+        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
+        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;
+
+            ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
+            ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
+            ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
+            ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;
+
+            ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
+            ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
+            ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
+            ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;
+
+            ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
+            ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
+            ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
+            ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;
+
+            acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
+            acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
+            acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
+            acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
+        }
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;
+
+            ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
+            ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
+            ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
+            ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;
+
+            ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
+            ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
+            ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
+            ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;
+
+            ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
+            ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
+            ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
+            ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;
+
+            acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
+            acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
+            acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
+            acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;
+
+            ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
+            ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
+            ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
+            ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;
+
+            ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
+            ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
+            ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
+            ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;
+
+            ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
+            ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
+            ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
+            ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;
+
+            acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
+            acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
+            acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
+            acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;
+
+            ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
+            ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
+            ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
+            ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;
+
+            ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
+            ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
+            ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
+            ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;
+
+            ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
+            ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
+            ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
+            ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;
+
+            acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
+            acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
+            acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
+            acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;
+
+            ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
+            ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
+            ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
+            ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;
+
+            ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
+            ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
+            ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
+            ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;
+
+            ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
+            ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
+            ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
+            ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;
+
+            acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
+            acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
+            acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
+            acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+    {
+        // Load values from matrix A
+        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        // Load values from matrix B
+        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
+
+        // Accumulate
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
+
+            acc00 += ((uint)tmp0);
+            acc01 += ((uint)tmp1);
+            acc02 += ((uint)tmp2);
+            acc03 += ((uint)tmp3);
+        }
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
+
+            acc10 += ((uint)tmp0);
+            acc11 += ((uint)tmp1);
+            acc12 += ((uint)tmp2);
+            acc13 += ((uint)tmp3);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
+
+            acc20 += ((uint)tmp0);
+            acc21 += ((uint)tmp1);
+            acc22 += ((uint)tmp2);
+            acc23 += ((uint)tmp3);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
+
+            acc30 += ((uint)tmp0);
+            acc31 += ((uint)tmp1);
+            acc32 += ((uint)tmp2);
+            acc33 += ((uint)tmp3);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        {
+            // Accumulate
+            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
+            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
+            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
+            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
+
+            acc40 += ((uint)tmp0);
+            acc41 += ((uint)tmp1);
+            acc42 += ((uint)tmp2);
+            acc43 += ((uint)tmp3);
+        }
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store the result
+    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
 }
 #endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
 
@@ -423,39 +1091,39 @@
 {
     Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);
 
-    int16 a_offset_s32 = (int16)0;
-    int16 b_offset_s32 = (int16)0;
+    int4 a_offset_s32 = (int4)0;
+    int4 b_offset_s32 = (int4)0;
 
 #if defined(A_OFFSET)
     Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);
 
     // Compute the offset contribution due to A_OFFSET
 #if defined(SUM_COL_HAS_BATCHES)
-    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
+    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
 #else  // defined(MATRIX_B_HAS_BATCHES)
-    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr));
+    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
 #endif // defined(MATRIX_B_HAS_BATCHES)
 
-    a_offset_s32 *= (int16)A_OFFSET;
+    a_offset_s32 *= (int4)A_OFFSET;
 #endif // defined(A_OFFSET)
 
 #if defined(B_OFFSET)
     Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);
 
     // Compute the offset contribution due to B_OFFSET
-    b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
-    b_offset_s32 *= (int16)B_OFFSET;
+    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
+    b_offset_s32 *= (int4)B_OFFSET;
 #endif // defined(B_OFFSET)
 
-    const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32;
+    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
 
-    int16 in_s32 = vload16(0, (__global int *)mm_result.ptr);
+    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);
 
     // Add the offset terms to GEMM's result
     in_s32 += offset_term_s32;
 
     // Store the result with the offset contribution
-    vstore16(in_s32, 0, (__global int *)mm_result.ptr);
+    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
 }
 #endif // defined(K_OFFSET)
 

diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
index 3e38c73..811aa1b 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/gemv.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "helpers.h"
 
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
 /** This kernel applies dot product to each plane on the input tensor and the corrispective column of the reshaped weight tensor.
  *
  * @note Datatype and source width and height should be given as a preprocessor argument using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height. e.g. -DDATA_TYPE=short
@@ -109,3 +110,91 @@
         }
     }
 }
+#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
+
+#if defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: S32
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  input_offset                          Input's quantization offset
+ * @param[in]  weights_offset                        Weights' quantization offset
+ */
+__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
+                                IMAGE_DECLARATION(weights),
+                                VECTOR_DECLARATION(dst),
+                                const int input_offset,
+                                const int weights_offset)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    int acc0 = 0;
+    int acc1 = 0;
+    int acc2 = 0;
+    int acc3 = 0;
+
+    // This kernel handles 4 rows per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        int4 w = convert_int4(vload4(0, (__global uchar *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        int4 tmp0 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s0))) + (int4)input_offset;
+        int4 tmp1 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s1))) + (int4)input_offset;
+        int4 tmp2 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s2))) + (int4)input_offset;
+        int4 tmp3 = convert_int4(vload4(0, (__global uchar *)(input_ptr + offset.s3))) + (int4)input_offset;
+
+        // Accumulate
+        acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
+        acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
+        acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
+        acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This check handles the last few rows when SRC_HEIGHT is not a multiple of four
+    if(rows_left >= 0)
+    {
+        vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
+#endif /* defined(SRC_WIDTH) && defined(SRC_HEIGHT) */

diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index b44d0f1..a69bcc1 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,12 +44,6 @@
         return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                        \
     }
 
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
-ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
-
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-
 /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
  * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
  *
@@ -57,7 +51,7 @@
  *
  * @return Product of two fixed-point numbers.
  */
-#define ASYMM_MULT_IMP(size)                                                                                 \
+#define ASYMM_MULT_IMPL(size)                                                                                \
     inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
     {                                                                                                        \
         VEC_DATA_TYPE(int, size)                                                                             \
@@ -68,26 +62,308 @@
         b_64 = convert_long##size(b);                                                                        \
         VEC_DATA_TYPE(long, size)                                                                            \
         ab_64 = a_64 * b_64;                                                                                 \
-        VEC_DATA_TYPE(long, size)                                                                            \
-        mask1 = 1 << 30;                                                                                     \
-        VEC_DATA_TYPE(long, size)                                                                            \
-        mask2 = 1 - (1 << 30);                                                                               \
-        VEC_DATA_TYPE(long, size)                                                                            \
-        nudge = select(mask2, mask1, ab_64 >= 0);                                                            \
-        VEC_DATA_TYPE(long, size)                                                                            \
-        mask = 1ll << 31;                                                                                    \
         VEC_DATA_TYPE(int, size)                                                                             \
-        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
+        ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
         return select(ab_x2_high32, INT_MAX, overflow);                                                      \
     }
 
-ASYMM_MULT_IMP(2)
-ASYMM_MULT_IMP(8)
-ASYMM_MULT_IMP(16)
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                                               \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
+        const int k_fractional_bits = 31;                                                                                           \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x2 = ASYMM_MULT(x, x, size);                                                                                                \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
+        VEC_DATA_TYPE(int, size)                                                                                                    \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+    }
 
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Result containing bits from @p then_val or from @p else_val, depending on whether the corresponding bit in @p if_mask is set.
+ */
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                                                                                     \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector whose elements have all bits set where the corresponding element of @p a is zero.
+ */
+#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                    \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
+        return select(all_zeros, all_ones, a == 0);                                      \
+    }
+
+/** For each element of input vector, the corresponding bits of the result item are set
+ * if the input item is non-zero.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @returns Output vector whose elements have all bits set where the corresponding element of @p a is non-zero.
+ */
+#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                        \
+        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
+        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
+        return select(all_zeros, all_ones, a != 0);                                          \
+    }
+
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
+    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
+    {                                                                                                                                                                                                         \
+        if(k_integer_bits > exponent)                                                                                                                                                                         \
+        {                                                                                                                                                                                                     \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
+            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
+                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
+                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
+        }                                                                                                                                                                                                     \
+        \
+        return result;                                                                                                                                                                                        \
+    }
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
+    {                                                                                                                         \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        mask = k_one_quarter - 1;                                                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
+        VEC_DATA_TYPE(int, size)                                                                                              \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
+        \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
+        \
+        if(k_integer_bits > 5)                                                                                                \
+        {                                                                                                                     \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
+        }                                                                                                                     \
+        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+    }
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Arithmetic left or right shift.
+ */
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
+    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                                      \
+        if(exponent < 0)                                                                                                   \
+        {                                                                                                                  \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
+        }                                                                                                                  \
+        \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
+        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                           \
+        result = x << exponent;                                                                                            \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
+        return result;                                                                                                     \
+    }
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        a64 = convert_long##size(a);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        b64 = convert_long##size(b);                                                                                      \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sum = a64 + b64;                                                                                                  \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                                         \
+        sign = select(minus_one, one, sum >= 0);                                                                          \
+        return convert_int##size((sum + sign) / 2);                                                                       \
+    }
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
+    {                                                                                                        \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
+        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
+        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
+        for(int i = 0; i < 3; i++)                                                                           \
+        {                                                                                                    \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
+            VEC_DATA_TYPE(int, size)                                                                         \
+            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
+            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
+        }                                                                                                    \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
+    }
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Rescaled value.
+ */
+#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
+    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                                               \
+        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+    }
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
 #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
-
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
     ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+ASYMM_MULT_IMPL(2)
+ASYMM_MULT_IMPL(4)
+ASYMM_MULT_IMPL(8)
+ASYMM_MULT_IMPL(16)
+
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
+ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+
+ASYMM_SELECT_USING_MASK_IMPL(2)
+ASYMM_SELECT_USING_MASK_IMPL(4)
+ASYMM_SELECT_USING_MASK_IMPL(8)
+ASYMM_SELECT_USING_MASK_IMPL(16)
+
+ASYMM_MASK_IF_ZERO_IMPL(2)
+ASYMM_MASK_IF_ZERO_IMPL(4)
+ASYMM_MASK_IF_ZERO_IMPL(8)
+ASYMM_MASK_IF_ZERO_IMPL(16)
+
+ASYMM_MASK_IF_NON_ZERO_IMPL(2)
+ASYMM_MASK_IF_NON_ZERO_IMPL(4)
+ASYMM_MASK_IF_NON_ZERO_IMPL(8)
+ASYMM_MASK_IF_NON_ZERO_IMPL(16)
+
+EXP_BARREL_SHIFTER_IMPL(2)
+EXP_BARREL_SHIFTER_IMPL(4)
+EXP_BARREL_SHIFTER_IMPL(8)
+EXP_BARREL_SHIFTER_IMPL(16)
+
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
+ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
+
+ASYMM_ROUNDING_HALF_SUM_IMPL(2)
+ASYMM_ROUNDING_HALF_SUM_IMPL(4)
+ASYMM_ROUNDING_HALF_SUM_IMPL(8)
+ASYMM_ROUNDING_HALF_SUM_IMPL(16)
+
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
+ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+
+ASYMM_RESCALE_IMPL(2)
+ASYMM_RESCALE_IMPL(4)
+ASYMM_RESCALE_IMPL(8)
+ASYMM_RESCALE_IMPL(16)
 
 #endif // ARM_COMPUTE_HELPERS_ASYMM_H

diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
new file mode 100644
index 0000000..75d99bd
--- /dev/null
+++ b/src/core/CL/cl_kernels/im2col.cl

@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif // FIXED_POINT_POSITION
+
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+#if !defined(FIXED_POINT_POSITION)
+
+// COND_DATA_TYPE is the signed integer type that is ELEMENT_SIZE bytes wide. The kernels below use it
+// as the mask type of select() on DATA_TYPE values; select() requires mask and data elements to have
+// the same size, so ELEMENT_SIZE is expected to be the size in bytes of DATA_TYPE.
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+// Any other element size has no matching mask type: stop the build
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1
+ *
+ * @note This kernel computes 4 elements per work-item
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH: e.g. -DSRC_WIDTH=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col1x1_stridex1_dchw(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const uint xc    = get_global_id(0) * 4;            // x coordinate of the first of the 4 elements in the convolved tensor
+    const uint yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const uint ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch index
+
+    // Clamp xc
+    // The strategy clamps at "xc" as it will be a valid value for sure
+    uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
+
+    // Check which values are valid (lanes whose x coordinate is still inside the source row)
+    const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+
+    // Invalid lanes fall back to xc, i.e. to the coordinate handled by lane 0
+    xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
+
+    // Calculate input indices
+    const uint xi = xc;
+    const uint yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+    const uint  xo = ch;
+    const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
+
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    data = vload4(0, (__global DATA_TYPE *)input_ptr);
+
+    // If out-of-bound, overwrite with the first element
+    data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
+
+    // One store per lane: each lane goes to its own output row. Clamped lanes target the same row as
+    // lane 0 and carry the same value, so the duplicated store is harmless.
+    *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
+    *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
+    *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
+    *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
+
+#ifdef HAS_BIAS
+    // Only the work-items of the last channel append the bias "1", in the element right after their own column
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+        *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_dchw(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // Work-item coordinates: (x, y) address the convolved tensor, z packs channel and batch together
+    const int conv_x  = get_global_id(0);
+    const int conv_y  = get_global_id(1);
+    const int plane   = get_global_id(2) % KERNEL_DEPTH;
+    const int batch_i = get_global_id(2) / KERNEL_DEPTH;
+
+    // Top-left corner of the 3x3 patch in source coordinates (negative inside the left/top padding)
+    const int in_x = conv_x * STRIDE_X - PAD_LEFT;
+    const int in_y = conv_y * STRIDE_Y - PAD_TOP;
+
+    // Destination position: first of the 9 slots owned by this channel, on the row of this patch
+    const int out_col = plane * 9;
+    const int out_row = conv_x + conv_y * CONVOLVED_WIDTH;
+
+    // Byte addresses of the patch origin and of the first output slot
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + in_x * (int)src_stride_x + in_y * (int)src_stride_y + plane * src_stride_z + batch_i * src_stride_w;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + out_col * dst_stride_x + out_row * dst_stride_y + batch_i * dst_stride_w;
+
+    __global DATA_TYPE *dst_elems = (__global DATA_TYPE *)dst_addr;
+
+    // Load the three rows of the patch, three elements each
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    top_row = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    mid_row = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    bot_row = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Source coordinates covered by the patch
+    const int3 cols = (int3)in_x + (int3)(0, 1, 2);
+    const int3 rows = (int3)in_y + (int3)(0, 1, 2);
+
+    // Columns lying inside the source width; shared by the three rows
+    const int3 cols_ok = cols >= (int3)0 && cols < (int3)SRC_WIDTH;
+
+    // An element is kept only if both its column and its row are inside the source tensor
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    top_ok = CONVERT((cols_ok && (int3)(rows.s0 >= 0 && rows.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    mid_ok = CONVERT((cols_ok && (int3)(rows.s1 >= 0 && rows.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+    VEC_DATA_TYPE(COND_DATA_TYPE, 3)
+    bot_ok = CONVERT((cols_ok && (int3)(rows.s2 >= 0 && rows.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
+
+    // Everything outside the tensor is replaced by PAD_VALUE
+    top_row = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, top_row, top_ok);
+    mid_row = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, mid_row, mid_ok);
+    bot_row = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, bot_row, bot_ok);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+    // Write the 9 elements in row-major order: the first 8 as one vector, the last one as a scalar
+    vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(top_row.s012, mid_row.s012, bot_row.s01), 0, dst_elems);
+    dst_elems[8] = bot_row.s2;
+
+#ifdef HAS_BIAS
+    // The last channel appends the bias "1" right after its 9 elements
+    if(plane == (KERNEL_DEPTH - 1))
+    {
+        dst_elems[9] = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col5x5_dchw(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
+
+    // Calculate input indices (top-left corner of the patch, negative inside the left/top padding)
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+    const int xo = ch * 25;                   // 5x5
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+    // Source coordinates covered by the patch: the first 4 columns/rows as a vector, the 5th as a scalar
+    int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
+    int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
+    int  x1 = xi + 4;
+    int  y1 = yi + 4;
+
+    // Check which columns (x) and rows (y) of the patch fall inside the source tensor
+    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+    x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+    y0_condition                = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
+    COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
+    COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+
+    // Rows 0 and 1 of the patch: 2 x 5 elements, written as one 8-wide and one 2-wide store
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        // An element is valid only if both its column and its row are inside the source tensor
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond10                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
+        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
+
+        // Replace with PAD_VALUE if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+                                              row10.s012),
+                0, (__global DATA_TYPE *)output_ptr);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    // Rows 2 and 3 of the patch: same scheme as above, using the y conditions of lanes 2 and 3
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond10                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
+        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
+
+        // Replace with PAD_VALUE if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+                                              row10.s012),
+                0, (__global DATA_TYPE *)output_ptr);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    // Row 4 of the patch: the last 5 elements, written as one 4-wide store plus one scalar.
+    // No further source row is read, so input_ptr is not advanced any more.
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00                = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
+
+        // Replace with PAD_VALUE if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
+        *((__global DATA_TYPE *)output_ptr + 4) = row01;
+
+        // Leave output_ptr just past the 25 elements of this channel (used by the bias below)
+        output_ptr += 5 * dst_stride_x;
+    }
+
+#ifdef HAS_BIAS
+    // The last channel appends the bias "1" right after its 25 elements
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col11x11_padx0_pady0_dchw(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
+
+    // Calculate input indices (this variant applies no padding offset, so no bounds masking is done)
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+
+    // Calculate output indices
+    const int xo = ch * 121;                  // 11x11
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    // Get input and output address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+
+    // Copy the 11 rows of the patch. Each row of 11 elements is moved as an 8-wide plus a 3-wide vector.
+    // Source rows are src_stride_y bytes apart; destination rows are packed back to back, 11 elements
+    // (11 * dst_stride_x bytes) apart. The destination stride must be used for the output offset.
+#pragma unroll
+    for(int row = 0; row < 11; ++row)
+    {
+        __global DATA_TYPE *src_row = (__global DATA_TYPE *)(input_ptr + row * src_stride_y);
+        __global DATA_TYPE *dst_row = (__global DATA_TYPE *)(output_ptr + row * 11 * dst_stride_x);
+
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        row00 = vload8(0, src_row);
+        VEC_DATA_TYPE(DATA_TYPE, 3)
+        row01 = vload3(0, src_row + 8);
+
+        vstore8(row00, 0, dst_row);
+        vstore3(row01, 0, dst_row + 8);
+    }
+
+#ifdef HAS_BIAS
+    // The last channel appends the bias "1" right after its 121 elements
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+        *((__global DATA_TYPE *)(output_ptr + 121 * dst_stride_x)) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+#endif // !defined(FIXED_POINT_POSITION)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
+ * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The number of trailing elements of each kernel row copied after the full-vector loop must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_padx0_pady0_dchw(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
+
+    // Calculate input indices (no padding offset is applied in this kernel)
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+    // Calculate output indices
+    const int xo                   = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int yo                   = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+    // Linearize convolution elements
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        int last_x = 0;
+        for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
+        {
+            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            VSTORE(VECTOR_SIZE)
+            (row, 0, output_ptr);
+            last_x = x;
+        }
+        // Copy the remainder of the row with VLOAD(WIDTH_MOD_VECTOR_SIZE)/VSTORE(WIDTH_MOD_VECTOR_SIZE). x is scoped to the loop above, so the first remaining column is rebuilt as last_x + VECTOR_SIZE; output_ptr was already advanced by the loop.
+        // NOTE(review): if the loop above never iterates (KERNEL_WIDTH < VECTOR_SIZE), last_x stays 0 and this reads column VECTOR_SIZE rather than xi - confirm the host always picks VECTOR_SIZE <= KERNEL_WIDTH.
+#if WIDTH_MOD_VECTOR_SIZE == 1
+        *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+        VSTORE(WIDTH_MOD_VECTOR_SIZE)
+        (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+        output_ptr += WIDTH_MOD_VECTOR_SIZE;
+    } /* End of loop over KERNEL_HEIGHT */
+
+#ifdef HAS_BIAS
+    if(ch == (KERNEL_DEPTH - 1)) // only the work-item of the last input feature map appends the trailing 1
+    {
+#ifdef FIXED_POINT_POSITION
+        *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else  // FIXED_POINT_POSITION
+        *output_ptr = 1.0f;
+#endif // FIXED_POINT_POSITION
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value stored for elements whose x or y coordinate falls outside [0, SRC_WIDTH) / [0, SRC_HEIGHT) must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_dchw(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
+
+    // Calculate input indices (may be negative when the window starts inside the left/top padding)
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
+
+    // Calculate output indices
+    const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+
+    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+
+    // Linearize convolution elements, one element per iteration
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
+        {
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+            *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+            {
+                *output_ptr = PAD_VALUE;
+            }
+            else
+            {
+                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            }
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+        }
+    }
+
+#ifdef HAS_BIAS
+    if(ch == (KERNEL_DEPTH - 1)) // only the work-item of the last input feature map appends the trailing 1
+    {
+#ifdef FIXED_POINT_POSITION
+        *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else  // FIXED_POINT_POSITION
+        *output_ptr = 1.0f;
+#endif // FIXED_POINT_POSITION
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
+ * the kernel width and height are the same as the width and height of the input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+__kernel void im2col_reduced_dchw(
+    TENSOR3D_DECLARATION(src),
+    VECTOR_DECLARATION(dst),
+    uint width, uint height)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    const uint image_size = width * height;
+
+    __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
+
+    *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+
+#ifdef HAS_BIAS
+    // If it is the last work-item of the global range in all three dimensions, append the trailing 1 right after its element
+    if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
+    {
+        tmp_out_ptr += dst_stride_x;
+#ifdef FIXED_POINT_POSITION
+        *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else  // FIXED_POINT_POSITION
+        *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1.0f;
+#endif // FIXED_POINT_POSITION
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file

diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index ee8ff27..dae0b99 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -183,13 +183,13 @@
         res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03));                                                   \
     })
 
-DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int       start_x = get_global_id(0) * stride_x - pad_x;
     int       start_y = get_global_id(1) * stride_y - pad_y;
-    const int end_x   = min(start_x + pool_size, upper_bound_w);
-    const int end_y   = min(start_y + pool_size, upper_bound_h);
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
 #if defined(EXCLUDE_PADDING)
     start_x = max(0, start_x);
     start_y = max(0, start_y);
@@ -249,7 +249,7 @@
 
 #if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average or l2 pooling
-    res = DIV_OP(res, calculate_avg_scale(2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+    res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
 #endif /* defined(POOL_AVG) || defined(POOL_L2) */
 
 #if defined(POOL_L2)
@@ -317,7 +317,7 @@
 
 #if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
-    res = DIV_OP(res, calculate_avg_scale(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+    res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
 #endif /* defined(POOL_AVG) || defined(POOL_L2) */
 
 #if defined(POOL_L2)
@@ -403,7 +403,7 @@
 }
 #endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
 
-#if defined(POOL_SIZE)
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
 
 // Set the initial value for the pooling operation accordingly with the data type
 #if defined(POOL_AVG) || defined(POOL_L2)
@@ -427,7 +427,7 @@
  *
  * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note -DFP16 must be passed at compile time if half float data type is used
- * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13 -DPOOL_SIZE_Y=13;
  * @note In case of average pooling the following information must be passed at compile time:
  *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
  *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
@@ -451,7 +451,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void pooling_layer_N(
+__kernel void pooling_layer_MxN(
     TENSOR3D_DECLARATION(input),
     TENSOR3D_DECLARATION(output))
 {
@@ -464,10 +464,10 @@
     DATA_TYPE sdata = INITIAL_VALUE;
 
     // Load data
-    for(int y = 0; y < POOL_SIZE; y++)
+    for(int y = 0; y < POOL_SIZE_Y; y++)
     {
         int x = 0;
-        for(; x <= ((int)POOL_SIZE - 8); x += 8)
+        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
         {
             VEC_DATA_TYPE(DATA_TYPE, 8)
             data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
@@ -479,7 +479,7 @@
         }
 
         // Leftover
-        for(; x < (int)POOL_SIZE; ++x)
+        for(; x < (int)POOL_SIZE_X; ++x)
         {
             DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
 #if defined(POOL_L2)
@@ -500,7 +500,7 @@
 
 #if defined(POOL_AVG) || defined(POOL_L2)
     // Divide by pool region in case of average pooling
-    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
 #endif /* defined(POOL_AVG) || defined(POOL_L2) */
 
 #if defined(POOL_L2)
@@ -511,4 +511,4 @@
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
 }
-#endif // defined(POOL_SIZE)
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)

diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 39c2c22..98850c0 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,13 +35,13 @@
 #error "L2 pooling is not supported"
 #endif /* defined(POOL_L2) */
 
-int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                         const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int       start_x = get_global_id(0) * stride_x - pad_x;
     int       start_y = get_global_id(1) * stride_y - pad_y;
-    const int end_x   = min(start_x + pool_size, upper_bound_w);
-    const int end_y   = min(start_y + pool_size, upper_bound_h);
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
 #if defined(EXCLUDE_PADDING)
     start_x = max(0, start_x);
     start_y = max(0, start_y);
@@ -51,7 +51,7 @@
 
 /** Performs a pooling function of pool size equal to N
  *
- * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13 -DPOOL_SIZE_Y=13;
  * @note In case of average pooling the following information must be passed at compile time:
  *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
  *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
@@ -75,7 +75,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void pooling_layer_N_quantized(
+__kernel void pooling_layer_MxN_quantized(
     TENSOR3D_DECLARATION(input),
     TENSOR3D_DECLARATION(output))
 {
@@ -87,10 +87,10 @@
     int  sdata = 0;
 
     // Load data
-    for(int y = 0; y < POOL_SIZE; y++)
+    for(int y = 0; y < POOL_SIZE_Y; y++)
     {
         int x = 0;
-        for(; x <= ((int)POOL_SIZE - 8); x += 8)
+        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
         {
             uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0));
             int8 data0  = convert_int8(data);
@@ -98,7 +98,7 @@
         }
 
         // Leftover
-        for(; x < (int)POOL_SIZE; ++x)
+        for(; x < (int)POOL_SIZE_X; ++x)
         {
             uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0));
             int data0  = convert_int(data);
@@ -113,7 +113,7 @@
     res          = POOL_OP(res, sdata);
 
 #if defined(POOL_AVG)
-    res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
+    res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
 #endif /* defined(POOL_AVG) */
 
     // Store result

diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index d46a226..aa7403b 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,39 +70,46 @@
  * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F32
  * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the source tensor
  * @param[in] partial_sum_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
- * @param[in] partial_sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] partial_sum_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] partial_sum_step_x                        partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_sum_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_sum_step_y                        partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_sums                                Local buffer for storing the partioal sum
+ * @param[in] local_sums                                Local buffer for storing the partial sum
  */
 __kernel void reduction_operation(
-    VECTOR_DECLARATION(src),
-    VECTOR_DECLARATION(partial_sum),
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(partial_sum),
     __local DATA_TYPE *local_sums)
 {
-    Vector src         = CONVERT_TO_VECTOR_STRUCT(src);
-    Vector partial_sum = CONVERT_TO_VECTOR_STRUCT(partial_sum);
+    Image src         = CONVERT_TO_IMAGE_STRUCT(src);
+    Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
 
     unsigned int lsize = get_local_size(0);
     unsigned int lid   = get_local_id(0);
 
-    local_sums[lid] = OPERATION((__global DATA_TYPE *)src.ptr);
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Perform parallel reduction
-    for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
+    for(unsigned int y = 0; y < get_local_size(1); ++y)
     {
-        if(lid < i)
-        {
-            local_sums[lid] += local_sums[lid + i];
-        }
+        local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
         barrier(CLK_LOCAL_MEM_FENCE);
-    }
 
-    if(lid == 0)
-    {
-        ((__global DATA_TYPE *)partial_sum.ptr + get_group_id(0))[0] = local_sums[0];
+        // Perform parallel reduction
+        for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
+        {
+            if(lid < i)
+            {
+                local_sums[lid] += local_sums[lid + i];
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        if(lid == 0)
+        {
+            ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
+        }
     }
 }
\ No newline at end of file

diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 5d8cd12..7fed879 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,178 +87,6 @@
 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 __constant uint4 idx4   = (uint4)(0, 1, 2, 3);
 
-/** Identifies the maximum value across the 1st dimension.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
- * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  width                             Input image width
- */
-__kernel void softmax_layer_max(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint width)
-{
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-
-    // Initialize local maximum
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    max_val = (VEC_DATA_TYPE(DATA_TYPE, 16))type_min;
-
-    // Calculate max of row
-    const uint width4 = width >> 4;
-    for(uint i = 0; i < width4; i++)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, 16)
-        data    = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
-        max_val = MAX_OP(data, max_val, DATA_TYPE, 16);
-    }
-
-#ifdef NON_MULTIPLE_OF_16
-    // Handle non multiple of 16
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
-    VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
-    widx    = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
-    max_val = MAX_OP(max_val, select(type_min, data, widx), DATA_TYPE, 16);
-#endif /* NON_MULTIPLE_OF_16 */
-
-    // Perform max reduction
-    max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, DATA_TYPE, 8);
-    max_val.s0123     = MAX_OP(max_val.s0123, max_val.s4567, DATA_TYPE, 4);
-    max_val.s01       = MAX_OP(max_val.s01, max_val.s23, DATA_TYPE, 2);
-    max_val.s0        = MAX_OP(max_val.s0, max_val.s1, DATA_TYPE, 1);
-
-    // Store result
-    *((__global DATA_TYPE *)dst.ptr) = max_val.s0;
-}
-
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
- * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (if undefined, assume it equals 1.0)
- *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
- * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in]  sum_step_y                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in]  width                             Input image width
- */
-__kernel void softmax_layer_shift_exp_sum(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(max),
-    TENSOR3D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(sum),
-    uint width)
-{
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
-#ifdef BETA
-    // Initialize beta
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    beta = (VEC_DATA_TYPE(DATA_TYPE, 16))BETA;
-#endif /* BETA */
-
-    // Load max value of 1D logits vector (row)
-    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
-
-    // Set sum vector
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    sum1D = 0;
-
-    // Shift values, exp and sum
-    const uint width4 = width >> 4;
-    for(uint i = 0; i < width4; i++)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, 16)
-        data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
-        data = SUB_OP(data, max_val, DATA_TYPE, 16);
-#ifdef BETA
-        data = MUL_OP(data, beta, DATA_TYPE, 16);
-#endif /* BETA */
-        data = EXP_OP(data, DATA_TYPE, 16);
-        vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
-        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
-    }
-
-#ifdef NON_MULTIPLE_OF_16
-    // Handle non multiple of 16
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
-    data = SUB_OP(data, max_val, DATA_TYPE, 16);
-#ifdef BETA
-    data = MUL_OP(data, beta, DATA_TYPE, 16);
-#endif /* BETA */
-    data = EXP_OP(data, DATA_TYPE, 16);
-    VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
-    widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
-    data = select(0, data, widx);
-    vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
-    sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
-#endif /* NON_MULTIPLE_OF_16 */
-
-    // Perform min/max reduction
-    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
-    sum1D.s0123     = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
-    sum1D.s01       = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
-    sum1D.s0        = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
-
-    // Calculate and store result
-    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
-}
-
 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
  *
  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short

diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 31f402f..cbcde4e 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,76 +21,51 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "asymm_helper.h"
-#include "helpers.h"
+#include "helpers_asymm.h"
 
 #define MAX_OP(x, y, type, size) max((x), (y))
 #define ADD_OP(x, y, type, size) ((x) + (y))
 
-__constant uchar16 type_min = 0;
-__constant uint16 idx16     = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
 
-/** Identifies the maximum value across the 1st dimension.
- *
- * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  width                             Input image width
- */
-__kernel void softmax_layer_max_quantized(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint width)
-{
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if VECTOR_SIZE == 2
+__constant uint2 idx__ = (uint2)(0, 1);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)
 
-    // Initialize local maximum
-    uchar16 max_val = 0;
+#elif VECTOR_SIZE == 4
+__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)
 
-    // Calculate max of row
-    const uint width4 = width >> 4;
-    for(uint i = 0; i < width4; i++)
-    {
-        uchar16 data = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
-        max_val      = MAX_OP(data, max_val, uchar, 16);
-    }
+#elif VECTOR_SIZE == 8
+__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)
 
-#ifdef NON_MULTIPLE_OF_16
-    // Handle non multiple of 16
-    uchar16 data = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
-    uchar16 widx = convert_uchar16(((uint16)(width4 << 4) + idx16) < width);
-    max_val      = MAX_OP(max_val, select(type_min, data, widx), uchar, 16);
-#endif /* NON_MULTIPLE_OF_16 */
+#else /* VECTOR_SIZE DEFAULT */
+#define VECTOR_SIZE 16
+#define LOG_VECTOR_SIZE 4
+__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
+#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
+#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)
 
-    // Perform max reduction
-    max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, uchar, 8);
-    max_val.s0123     = MAX_OP(max_val.s0123, max_val.s4567, uchar, 4);
-    max_val.s01       = MAX_OP(max_val.s01, max_val.s23, uchar, 2);
-    max_val.s0        = MAX_OP(max_val.s0, max_val.s1, uchar, 1);
+#endif /* VECTOR_SIZE END */
 
-    // Store result
-    *((__global uchar *)dst.ptr) = max_val.s0;
-}
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
+#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
 
 #if defined(DIFF_MIN)
 
-int16 mult_by_quantized_multiplier(int16 data)
+VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
 {
 #if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
     if(INPUT_BETA_MULTIPLIER > 1)
@@ -101,10 +76,21 @@
     return data;
 }
 
+int4 mult_by_quantized_multiplier_parallel(int4 data)
+{
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    return data;
+}
+
 /** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
  * then gets the exponent of each element as sums all elements across each row.
  *
- * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
  * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
  * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
  *
@@ -142,62 +128,393 @@
  * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
  * @param[in]  width                             Input image width
  */
-__kernel void softmax_layer_shift_exp_sum_quantized(
+__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(max),
+    TENSOR3D_DECLARATION(maxo),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(sum),
     uint width)
 {
-    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 
-    // Load max value of 1D logits vector (row)
-    int max_val = convert_int(*((__global uchar *)offset(&max, 0, 0)));
+    VEC_UCHAR max_val_vec = 0;
 
-    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
-    int16 sum1D = 0;
-
-    // Shift values, exp and sum
-    const uint width4 = width >> 4;
+    // Calculate max of row
+    const uint width4 = width >> LOG_VECTOR_SIZE;
     for(uint i = 0; i < width4; i++)
     {
-        uchar16 data         = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
-        int16 data_fp        = convert_int16(data);
-        int16 data_diff      = data_fp - max_val;
-        int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
-        data_fp              = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
-        data_fp              = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
-        vstore16(data_diff, 0, (__global int *)offset(&dst, i << 4, 0));
-        sum1D = sum1D + select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
+        VEC_UCHAR data = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        max_val_vec    = MAX_OP(data, max_val_vec, uchar, 16);
     }
 
-#ifdef NON_MULTIPLE_OF_16
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
     // Handle non multiple of 16
-    uchar16 data         = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
-    int16 data_fp        = convert_int16(data);
-    int16 data_diff      = data_fp - max_val;
-    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
-    data_fp              = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
-    data_fp              = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
-    int16 widx           = convert_int16(((uint16)(width4 << 4) + idx16) < width);
-    vstore16(data_diff, 0, (__global int *)offset(&dst, width4 << 4, 0));
-    data_fp = select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
-    sum1D   = sum1D + select(0, data_fp, widx);
-#endif /* NON_MULTIPLE_OF_16 */
+    VEC_UCHAR uchar_min = (VEC_UCHAR)0;
+    VEC_UCHAR data      = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
+    VEC_UCHAR widx      = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
+    max_val_vec         = MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, 16);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 
-    // Perform min/max reduction
-    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, qs16, 8);
-    sum1D.s0123     = ADD_OP(sum1D.s0123, sum1D.s4567, qs16, 4);
-    sum1D.s01       = ADD_OP(sum1D.s01, sum1D.s23, qs16, 2);
-    sum1D.s0        = ADD_OP(sum1D.s0, sum1D.s1, qs16, 1);
+    // Perform max reduction
+#if VECTOR_SIZE == 16
+    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
+#endif /* VECTOR SIZE 4 END */
+    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
+
+    // Store result
+    *((__global uchar *)maxo.ptr) = max_val_vec.s0;
+
+    // Second part
+
+    // Load max value of 1D logits vector (row)
+    int max_val = convert_int(*((__global uchar *)offset(&maxo, 0, 0)));
+
+    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
+    VEC_INT sum1D = 0;
+
+    // Shift values, exp and sum
+    for(uint i = 0; i < width4; i++)
+    {
+        VEC_UCHAR data         = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        VEC_INT data_fp        = CONVERT(data, VEC_INT);
+        VEC_INT data_diff      = data_fp - max_val;
+        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
+        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+        VSTORE(VECTOR_SIZE)
+        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    // Handle non multiple of VECTOR_SIZE
+    data                   = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
+    VEC_INT data_fp        = CONVERT(data, VEC_INT);
+    VEC_INT data_diff      = data_fp - max_val;
+    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
+    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
+    VSTORE(VECTOR_SIZE)
+    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
+    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    sum1D   = sum1D + select(0, data_fp, widx_);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform sum reduction
+#if VECTOR_SIZE == 16
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, uchar, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, uchar, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, uchar, 2);
+#endif /* VECTOR SIZE 4 END */
+    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, uchar, 1);
 
     // Calculate and store result
     *((__global int *)sum.ptr) = sum1D.s0;
 }
 
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then computes the exponential of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    const uint4 idx4 = (uint4)(0, 1, 2, 3);
+    const uint  lid  = get_local_id(0);
+
+    // Define one temporary vector per work-item.
+    __local int4 tmp_local[GRID_SIZE];
+    __local uchar max_local;
+
+    uchar4 uchar_min   = (uchar4)0;
+    uchar4 max_val_vec = uchar_min;
+
+    // Number of elements per work-item.
+    const uint row = width / GRID_SIZE;
+    // Number of iterations per work-item.
+    const uint width_ = row >> 2;
+    // Calculate max of row
+    uint i = 0;
+    for(; i < width_; i++)
+    {
+        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    // How many work-items needed to complete the computation.
+    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of 4
+        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        uchar4 widx     = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
+        max_val_vec     = MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = convert_int4(max_val_vec);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        max_val_vec     = MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
+        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
+        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
+        max_local       = max_val_vec.s0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Second section */
+
+    // Set sum vector
+    int4 sum1D   = 0;
+    int  max_val = convert_int(max_local);
+
+    // Shift values, exp and sum
+    for(i = 0; i < width_; i++)
+    {
+        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
+        int4 data_fp        = convert_int4(data);
+        int4 data_diff      = data_fp - max_val;
+        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
+        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
+        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
+        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
+        int4 data_fp        = convert_int4(data);
+        int4 data_diff      = data_fp - max_val;
+        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
+        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
+        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
+        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 elements ahead, *4 is due to the stride
+        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
+        int4 data_fp        = convert_int4(data);
+        int4 data_diff      = data_fp - max_val;
+        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
+        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
+        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
+        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
+        data_fp             = select(0, data_fp, widx);
+        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = sum1D;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
+        // Perform sum reduction
+        sum1D.s01                  = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
+        sum1D.s0                   = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
+        *((__global int *)sum.ptr) = sum1D.s0;
+    }
+}
+
 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
  *
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
@@ -247,15 +564,21 @@
     int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
     int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
     int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
-    int16 shifted_scale           = asymm_one_over_one_plus_x_for_x_in_0_1(shifted_sum_minus_one);
+    int16 shifted_scale           = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
 
     // It was already calculated in prev layer, should be stored into tmp output and reused
     int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
-    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
-    int16 data           = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+    int16 data_diff_mult = data_diff;
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
 
-    data = asymm_mult(shifted_scale, data);
-    data = asymm_rounding_divide_by_pow2(data, num_bits_over_unit + 31 - 8);
+    data = ASYMM_MULT(shifted_scale, data, 16);
+    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
     data = select(0, data, data_diff >= (int16)(DIFF_MIN));
     vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
 }

diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index d85de88..a78b3e1 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp

@@ -47,8 +47,10 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU),
-                                    "For QASYMM8 only lower/upper bounded relu is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
+                                    "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported");
 
     // Checks performed when output is configured
     if((output != nullptr) && (output->total_size() != 0))
@@ -94,7 +96,7 @@
 } // namespace
 
 CLActivationLayerKernel::CLActivationLayerKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
 }
 
@@ -102,6 +104,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
+    _run_in_place = (output == nullptr) || (output == input);
+
     if(output != nullptr)
     {
         // Output auto inizialitation if not yet initialized
@@ -145,12 +149,15 @@
         build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
         build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
 
+        const int o1 = input->info()->quantization_info().offset;
+        // Quantized value of 0 corresponds to the offset o1
+        build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
         // Set scale and offset of the input and output if they have different quantization info
         if(is_data_type_quantized_asymmetric(dt) && output != nullptr)
         {
             const float s1 = input->info()->quantization_info().scale;
             const float s2 = output->info()->quantization_info().scale;
-            const int   o1 = input->info()->quantization_info().offset;
             const int   o2 = output->info()->quantization_info().offset;
 
             if(o1 != o2 || s1 != s2)
@@ -168,7 +175,7 @@
         build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
     }
 
-    build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
+    build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
     if(is_data_type_fixed_point(dt))
     {
         build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
@@ -183,7 +190,7 @@
     _output = output;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -198,8 +205,9 @@
 
 Status CLActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
+    const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
 
     return Status{};
 }
@@ -216,7 +224,7 @@
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(_output != nullptr)
+        if(!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }

diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 75701ee..c4904ec 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,60 +24,75 @@
 #include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
 
 using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
 {
     ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2);
 
     // Validate in case of configured output
-    if((output != nullptr) && (output->total_size() != 0))
+    if(output.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
                                         "Output can only be U8 if both inputs are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output);
     }
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
 
-    Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(output, out_shape);
 
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+        {
+            set_format_if_unknown(output, Format::S16);
+        }
+        else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+        {
+            set_format_if_unknown(output, Format::F16);
+        }
+        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+        {
+            set_format_if_unknown(output, Format::F32);
+        }
+    }
 
-    bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
 
-    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
-                                                       input2->valid_region());
+    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
 
     output_access.set_valid_region(win, valid_region);
 
@@ -94,26 +109,11 @@
 void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
 
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output->info(), Format::S16);
-        }
-        else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output->info(), Format::F32);
-        }
-        else if(input1->info()->data_type() == DataType::F16 && input2->info()->data_type() == DataType::F16)
-        {
-            set_format_if_unknown(*output->info(), Format::F16);
-        }
-    }
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input1 = input1;
     _input2 = input2;
@@ -135,16 +135,15 @@
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 }
 
 Status CLArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -154,16 +153,49 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
+    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+    const TensorShape &out_shape = _output->info()->tensor_shape();
+
+    bool can_collapse = true;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input1, slice);
-        add_3D_tensor_argument(idx, _input2, slice);
+
+        add_3D_tensor_argument(idx, _input1, slice_input1);
+        add_3D_tensor_argument(idx, _input2, slice_input2);
         add_3D_tensor_argument(idx, _output, slice);
+
         enqueue(queue, *this, slice);
+
+        collapsed.slide_window_slice_3D(slice_input1);
+        collapsed.slide_window_slice_3D(slice_input2);
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+
+BorderSize CLArithmeticAdditionKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize(0, border, 0, 0);
+}

diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 663b044..87fc1d0 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
                           const ITensorInfo *mean, const ITensorInfo *var,
                           const ITensorInfo *beta, const ITensorInfo *gamma,
-                          float epsilon)
+                          float epsilon, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
@@ -50,6 +50,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+    if(act_info.enabled())
+    {
+        ActivationLayerInfo::ActivationFunction act = act_info.activation();
+        ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
+        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
+                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
+    }
 
     if(output != nullptr && output->total_size() != 0)
     {
@@ -93,12 +101,12 @@
 } // namespace
 
 CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
 {
 }
 
 void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
-                                                float epsilon)
+                                                float epsilon, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
 
@@ -110,6 +118,8 @@
     _gamma   = gamma;
     _epsilon = epsilon;
 
+    _run_in_place = (output == nullptr) || (output == input);
+
     if(output != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
@@ -118,41 +128,51 @@
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
-                                                  mean->info(), var->info(), beta->info(), gamma->info(), epsilon));
+                                                  mean->info(), var->info(), beta->info(), gamma->info(), epsilon, act_info));
 
     const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-    build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option_if(act_info.enabled(), "-D" + string_from_activation_func(act_info.activation()));
+    build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+    build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts.options()));
 
     // Set kernel static arguments
-    unsigned int include_output = (output != nullptr) ? 1 : 0;
+    unsigned int include_output = (!_run_in_place) ? 1 : 0;
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
+
+    _config_id = "batch_normalization_layer_";
+    _config_id += string_from_data_type(input->info()->data_type());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
 }
 
 Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
                                                  const ITensorInfo *mean, const ITensorInfo *var,
                                                  const ITensorInfo *beta, const ITensorInfo *gamma,
-                                                 float epsilon)
+                                                 float epsilon, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+    const bool run_in_place = (output == nullptr) || (output == input);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
 
     return Status{};
 }
@@ -167,7 +187,7 @@
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-    unsigned int include_output = (_output != nullptr) ? 1 : 0;
+    unsigned int include_output = (!_run_in_place) ? 1 : 0;
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, vector_slice);
     add_1D_tensor_argument(idx, _var, vector_slice);
@@ -178,11 +198,11 @@
     {
         idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(_output != nullptr)
+        if(!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window.slide_window_slice_3D(slice));
 }

diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
index be046cf..65843b8 100644
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,25 +49,48 @@
 
 void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON(input == output);
+
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    // Check if input tensor has a valid format
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-    ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
+    // Check if channel is valid for given format
+    const Format format = input->info()->format();
+    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+    // Halve the processed elements for U,V channels due to sub-sampling of 2
+    _subsampling = 1;
+
+    if(format == Format::YUYV422 || format == Format::UYVY422)
+    {
+        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input);
+
+        if(channel != Channel::Y)
+        {
+            _subsampling = 2;
+        }
+    }
+
+    // Calculate output tensor shape using subsampling
+    TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel);
+    set_shape_if_empty(*output->info(), output_shape);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     _input  = input;
     _output = output;
 
-    // Check format
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
     // Create kernel
     std::string           kernel_name = "channel_extract_" + string_from_format(format);
     std::set<std::string> build_opts  = { ("-DCHANNEL_" + string_from_channel(channel)) };
     _kernel                           = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
 
-    // Half the processed elements for U,V channels due to sub-sampling of 2
-    _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1;
-
     // Configure window
     Window                 win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
     AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
@@ -83,17 +106,34 @@
 
 void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
+    set_format_if_unknown(*output->info(), Format::U8);
+
+    // Check if channel is valid for given format
+    const Format format = input->info()->format();
+    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+    // Get input plane from the given channel
+    const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(format, channel));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane);
+
+    if(Channel::Y == channel && format != Format::YUV444)
+    {
+        // Check if the width of the tensor shape is even for formats with subsampled channels (NV12, NV21 and IYUV)
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane);
+    }
+
+    // Calculate 2x2 subsampled tensor shape
+    TensorShape output_shape = calculate_subsampled_shape(input->cl_plane(0)->info()->tensor_shape(), format, channel);
+    set_shape_if_empty(*output->info(), output_shape);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
+
+    // Check if input tensor has a valid format
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-    ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
-
-    // Get format
-    const Format fmt = input->info()->format();
-
-    // Get input plane
-    const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel));
-    ARM_COMPUTE_ERROR_ON(nullptr == input_plane);
 
     _output      = output;
     _input       = input_plane;
@@ -102,13 +142,13 @@
     // Create kernel
     std::string           kernel_name;
     std::set<std::string> build_opts;
-    if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt)
+    if(Channel::Y == channel || Format::IYUV == format || Format::YUV444 == format)
     {
         kernel_name = "copy_plane";
     }
     else
     {
-        kernel_name = "channel_extract_" + string_from_format(fmt);
+        kernel_name = "channel_extract_" + string_from_format(format);
         build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
     }
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));

diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 499e1e8..eacfa4c 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,10 +31,55 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <cmath>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims)));
+
+    const unsigned int num_elems_read_per_iteration = is_data_type_fixed_point(input->data_type()) ? 1 : 8;
+
+    // Configure window
+    Window win = calculate_max_window(*input, Steps(num_elems_read_per_iteration));
+
+    // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
+    AccessWindowHorizontal input_access(input, 0, num_elems_read_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access);
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 CLCol2ImKernel::CLCol2ImKernel()
     : _input(nullptr), _output(nullptr), _convolved_dims()
@@ -43,20 +88,10 @@
 
 void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, convolved_dims.first);
-    output_shape.set(1, convolved_dims.second);
-    output_shape.set(2, input->info()->tensor_shape()[0]);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims));
 
     _input          = input;
     _output         = output;
@@ -67,6 +102,8 @@
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+    build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
     build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
 
@@ -87,15 +124,10 @@
         }
     }
 
-    // Configure window
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    // The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
-    Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), _convolved_dims);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
     // Set config_id for enabling LWS tuning
     _config_id = "col2im_";
@@ -110,6 +142,12 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
+Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
+    return Status{};
+}
+
 void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index fd64dc4..2b08c8d 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,8 +63,8 @@
     _input  = input;
     _output = output;
 
-    std::stringstream     kernel_name;
-    std::set<std::string> options;
+    std::stringstream kernel_name;
+    CLBuildOptions    build_opts;
     kernel_name << "convolution" << matrix_size << "x" << matrix_size << "_static";
 
     if(scale == 0)
@@ -76,19 +76,19 @@
     {
         std::stringstream mat_str;
         mat_str << "-DMAT" << i << "=" << conv[i];
-        options.insert(mat_str.str());
+        build_opts.add_option(mat_str.str());
     }
 
-    options.insert("-DSCALE=" + support::cpp11::to_string(scale));
+    build_opts.add_option("-DSCALE=" + support::cpp11::to_string(scale));
 
     DataType data_type = data_type_for_convolution_matrix(conv, matrix_size * matrix_size);
-    options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
 
     std::stringstream out_type;
     out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
-    options.insert(out_type.str());
+    build_opts.add_option(out_type.str());
 
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_opts.options()));
 
     // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 8;

diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 5c08d5b..650c5b8 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,12 +42,12 @@
                                                     const PadStrideInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_UNUSED(info);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
     {
@@ -79,9 +79,8 @@
     constexpr unsigned int num_elems_processed_per_iteration = 1;
 
     // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal output_access(output->info(), 0, 0, num_elems_processed_per_iteration);
+    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);

diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index 1c0fe99..29564b3 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp

@@ -51,13 +51,15 @@
 
 void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
 
+    bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
+
     if(biases != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(weights->info()->data_type()))
+        if(is_qasymm)
         {
             ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -98,82 +100,124 @@
     build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
 
-    // Create kernel
-    std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set static arguments
-    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
+    if(is_qasymm)
     {
         float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
         int   output_multiplier = 0;
         int   output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
-        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0);
-
-        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
-        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
-        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
-        _kernel.setArg(idx++, output_multiplier);
-        _kernel.setArg(idx++, output_shift);
+        build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
+        build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
+        build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
+        build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
+        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
+        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
     }
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.
     const GPUTarget gpu_target = get_arch_from_target(get_target());
-    if(gpu_target == GPUTarget::BIFROST)
-    {
-        // Assume uniform padding and striding.
-        const size_t pad    = _conv_pad_left;
-        const size_t stride = _conv_stride_x;
-        const size_t width  = input->info()->dimension(0);
-        if(pad == 1)
-        {
-            const size_t width_by_stride = width / stride;
-            if(width_by_stride == 28) // 56/2 or 28/1
-            {
-                _lws_hint = cl::NDRange(7, 4, 3);
-            }
-            else if(width_by_stride == 14) // 28/2 or 14/1
-            {
-                _lws_hint = cl::NDRange(7, 7, 4);
-            }
-        }
-        else if(pad == 0)
-        {
-            if(width >= 56) // 56 or 112
-            {
-                _lws_hint = cl::NDRange(8, 5, 2);
-            }
-            else if(width >= 14) // 14 or 28
-            {
-                _lws_hint = cl::NDRange(1, 5, 2);
-            }
-            else // 7
-            {
-                _lws_hint = cl::NDRange(1, 1, 2);
-            }
-        }
-    }
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 8 / data_size_from_type(input->info()->data_type());
-    const unsigned int num_elems_written_per_iteration   = num_elems_processed_per_iteration;
-    const unsigned int num_elems_read_per_iteration      = 3 + (num_elems_processed_per_iteration - 1) * _conv_stride_x;
-    const unsigned int num_rows_read_per_iteration       = 3;
+    unsigned int num_elems_read_per_iteration_x    = 0;
+    unsigned int num_elems_read_per_iteration_y    = 0;
+    unsigned int num_elems_written_per_iteration_x = 0;
+    unsigned int num_elems_written_per_iteration_y = 0;
 
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    // Create kernel
+    std::string kernel_name;
 
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+    if(input->info()->data_type() == DataType::F16)
+    {
+        kernel_name                       = "depthwise_convolution_3x3_f16";
+        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
+        num_elems_written_per_iteration_y = 1;
+        num_elems_read_per_iteration_y    = 3;
+        switch(_conv_stride_x)
+        {
+            case 1:
+                num_elems_read_per_iteration_x = 8;
+                break;
+            case 2:
+                num_elems_read_per_iteration_x = 9;
+                break;
+            case 3:
+                num_elems_read_per_iteration_x = 16;
+                break;
+            default:
+                num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
+                break;
+        }
+    }
+    else if(input->info()->data_type() == DataType::F32 && gpu_target == GPUTarget::BIFROST)
+    {
+        if(_conv_stride_x == 1 && _conv_stride_y == 1)
+        {
+            kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost";
+            num_elems_read_per_iteration_x    = 4;
+            num_elems_read_per_iteration_y    = 6;
+            num_elems_written_per_iteration_x = 2;
+            num_elems_written_per_iteration_y = 4;
+        }
+        else if(_conv_stride_x == 2 && _conv_stride_y == 2)
+        {
+            kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost";
+            num_elems_read_per_iteration_x    = 6;
+            num_elems_read_per_iteration_y    = 5;
+            num_elems_written_per_iteration_x = 2;
+            num_elems_written_per_iteration_y = 2;
+        }
+        else
+        {
+            kernel_name                       = "depthwise_convolution_3x3";
+            num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
+            num_elems_written_per_iteration_y = 1;
+            num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
+            num_elems_read_per_iteration_y    = 3;
+        }
+    }
+    else
+    {
+        kernel_name                       = is_qasymm ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
+        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
+        num_elems_written_per_iteration_y = (is_qasymm && _conv_stride_y < 3) ? (2 / _conv_stride_y) : 1;
+        num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
+        num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
+    }
+
+    // Create window and update padding
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+    AccessWindowRectangle input_access(input->info(), -_conv_pad_left, -_conv_pad_top,
+                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+                                       _conv_stride_x, _conv_stride_y);
+    AccessWindowStatic    weights_access(weights->info(), 0, 0, 3, 3);
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
 
     update_window_and_padding(win, input_access, weights_access, output_access);
 
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
 void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)

diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index ad9ac0e..9851475 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,9 +44,10 @@
 
 void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
 
@@ -54,24 +55,25 @@
     _output = output;
 
     // Create kernel
-    std::set<std::string> build_opts;
+    CLBuildOptions build_opts;
 
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
-    build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-    build_opts.emplace("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
-    build_opts.emplace("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
-    build_opts.emplace("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
-    build_opts.emplace("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
-    build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
-    build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
-    if(has_bias)
-    {
-        build_opts.emplace("-DHAS_BIAS");
-    }
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+    build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+    build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+    build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+    build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+    build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+    build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+    build_opts.add_option_if_else(is_data_type_quantized_asymmetric(input->info()->data_type()),
+                                  "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset),
+                                  "-DPAD_VALUE=0");
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts.options()));
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.

diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index dc47bb0..83fc168 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
 
 void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
     TensorShape output_shape = input->info()->tensor_shape();
@@ -50,7 +50,7 @@
     output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -60,12 +60,12 @@
     _output = output;
 
     // Create kernel
-    std::set<std::string> build_opts;
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace("-DCONV_WIDTH=" + support::cpp11::to_string(conv_w));
-    build_opts.emplace("-DCONV_HEIGHT=" + support::cpp11::to_string(conv_h));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DCONV_WIDTH=" + support::cpp11::to_string(conv_w));
+    build_opts.add_option("-DCONV_HEIGHT=" + support::cpp11::to_string(conv_h));
 
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_vector_to_tensor", build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_vector_to_tensor", build_opts.options()));
 
     // Configure  kernel window
     Window win = calculate_max_window(*input->info(), Steps());

diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
index 81dd6b4..26da96f 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,9 +41,10 @@
 
 void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
 

diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index 216fa27..4efdb76 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,5 +97,5 @@
         add_1D_tensor_argument(idx, _min_max, min_max_slice);
         enqueue(queue, *this, slice);
     }
-    while(window.slide_window_slice_3D(slice) && min_max_window.slide_window_slice_1D(min_max_slice));
+    while(window_collapsed.slide_window_slice_3D(slice) && min_max_window.slide_window_slice_1D(min_max_slice));
 }

diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 76fdb6d..86a3581 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,19 +117,16 @@
     TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);
 
     // Output auto inizialitation if not yet initialized
-    //input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output, output_shape,
                        1,
                        input->data_type(),
                        input->fixed_point_position(),
                        input->quantization_info());
 
-    unsigned int conv_stride_x   = std::get<0>(conv_info.stride());
-    unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
-    unsigned int conv_pad_left   = std::max(conv_info.pad_left(), kernel_size / 2);
-    unsigned int conv_pad_top    = std::max(conv_info.pad_top(), kernel_size / 2);
-    unsigned int conv_pad_right  = std::max(conv_info.pad_right(), kernel_size / 2);
-    unsigned int conv_pad_bottom = std::max(conv_info.pad_bottom(), kernel_size / 2);
+    unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+    unsigned int conv_pad_left = conv_info.pad_left();
+    unsigned int conv_pad_top  = conv_info.pad_top();
 
     unsigned int num_elems_read_per_iteration_x    = 0;
     unsigned int num_elems_read_per_iteration_y    = 0;
@@ -239,19 +236,13 @@
         }
     }
 
-    // Calculate right and bottom border
-    int input_width  = input->dimension(0) + conv_pad_left + conv_pad_right;
-    int input_height = input->dimension(1) + conv_pad_top + conv_pad_bottom;
-
-    // Add padding only if necessary or it would always result in a window_changed
-    input_width  = ceil_to_multiple(input_width, num_elems_read_per_iteration_x);
-    input_height = ceil_to_multiple(input_height, num_elems_read_per_iteration_y);
-
     // Create window and update padding
     bool   window_changed = false;
     Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
 
-    AccessWindowStatic    input_access(input, -conv_pad_left, -conv_pad_top, input_width, input_height);
+    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+                                       conv_stride_x, conv_stride_y);
     AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
     AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
 
@@ -285,7 +276,6 @@
     TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
 
     // Output auto inizialitation if not yet initialized
-    //input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output->info(),
                        output_shape,
                        1,
@@ -302,18 +292,13 @@
 
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
+    _border_size   = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
 
     _input   = input;
     _weights = weights;
     _output  = output;
     _biases  = biases;
 
-    int conv_pad_left   = std::min(conv_info.pad_left(), kernel_size / 2);
-    int conv_pad_top    = std::min(conv_info.pad_top(), kernel_size / 2);
-    int conv_pad_right  = std::min(conv_info.pad_right(), kernel_size / 2);
-    int conv_pad_bottom = std::min(conv_info.pad_bottom(), kernel_size / 2);
-    _border_size        = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
-
     const GPUTarget gpu_target = get_arch_from_target(get_target());
 
     std::stringstream kernel_name;
@@ -450,13 +435,13 @@
     _config_id += "_";
     _config_id += support::cpp11::to_string(kernel_size);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(conv_pad_left);
+    _config_id += support::cpp11::to_string(border_size().left);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(conv_pad_top);
+    _config_id += support::cpp11::to_string(border_size().top);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(conv_pad_right);
+    _config_id += support::cpp11::to_string(border_size().right);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(conv_pad_bottom);
+    _config_id += support::cpp11::to_string(border_size().bottom);
     _config_id += "_";
     _config_id += support::cpp11::to_string(_conv_stride_x);
     _config_id += "_";

diff --git a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
new file mode 100644
index 0000000..f23ecf3
--- /dev/null
+++ b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp

@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16,
+                                                         DataType::F32);
+    // Bias is optional: the first branch validates it when given, the else branch handles its absence
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32, DataType::F16, DataType::F32);
+        // Asymmetric-quantized input requires an S32 bias; any other input requires a bias of the same data type
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        }
+        // Bias may have at most one dimension
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()),
+                                        "Calling output stage kernel with floating point arguments");
+    }
+
+    // Checks performed on output
+    if(input->data_type() == DataType::S32)
+    {
+        // Quantized configuration checks: an S32 input must be paired with a QASYMM8 output
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    }
+    else
+    {
+        // In case of out-of-place computation (supported for non-quantized configurations): a null or empty output is not checked
+        if((output != nullptr) && (output->total_size() != 0))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        }
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    bool         window_changed                    = false;
+    unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+
+    // Update processed elements when input is S32 (comes from quantization input): a fixed 16 replaces the size-derived value
+    if(input->data_type() == DataType::S32)
+    {
+        num_elems_processed_per_iteration = 16;
+    }
+
+    // Configure kernel window over the input, using the per-iteration element count as the step
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    // A non-null, non-empty output gets its own access window; otherwise only input (and bias, if any) are registered
+    if(output != nullptr && (output->total_size() != 0))
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+        if(bias == nullptr)
+        {
+            window_changed = update_window_and_padding(win, input_access, output_access);
+        }
+        else
+        {
+            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+            window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
+        }
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    else
+    {
+        if(bias == nullptr)
+        {
+            window_changed = update_window_and_padding(win, input_access);
+        }
+        else
+        {
+            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+            window_changed = update_window_and_padding(win, input_access, bias_access);
+        }
+
+        input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
+    }
+    // A changed window is returned as an "Insufficient Padding!" runtime error together with the computed window
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLDirectConvolutionLayerOutputStageKernel::CLDirectConvolutionLayerOutputStageKernel()
+    : _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
+{ // Tensor pointers start null and the three result parameters start at zero; configure() assigns the real values
+}
+
+void CLDirectConvolutionLayerOutputStageKernel::configure(ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                          int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Auto-initialize output if required (output itself is optional and may be null)
+    if(output != nullptr)
+    {
+        // Work out expected output data type: S32 input maps to QASYMM8, every other type is kept as-is
+        const DataType output_dt = (input->info()->data_type() == DataType::S32) ? DataType::QASYMM8 : input->info()->data_type();
+        // Output tensor auto initialization if not yet initialized, cloning the input info with the data type above
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
+    }
+
+    // Perform validation step; null bias/output are forwarded as null tensor infos
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info()));
+
+    _bias                         = bias;
+    _input                        = input;
+    _output                       = output;
+    _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+    _result_shift                 = result_shift;
+    _result_offset_after_shift    = result_offset_after_shift;
+
+    // Create kernel; HAS_BIAS is defined only when a bias tensor is supplied
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(bias != nullptr, "-DHAS_BIAS");
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("output_stage_quantized", build_opts.options()));
+
+    // Set static kernel arguments: they start after the slots of two 3D tensors plus, when bias is present, one 1D tensor
+    int idx = 2 * num_arguments_per_3D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
+    _kernel.setArg<int>(idx++, _result_offset_after_shift);
+    _kernel.setArg<int>(idx++, _result_fixedpoint_multiplier);
+    _kernel.setArg<int>(idx++, _result_shift);
+
+    // Configure kernel window; a failing status from the window helper is thrown as an error
+    auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+Status CLDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output));
+    // bias and output are both optional: clone each only when present so a null pointer is never dereferenced
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias == nullptr ? nullptr : bias->clone().get(), output == nullptr ? nullptr : output->clone().get()).first);
+    return Status{};
+}
+
+void CLDirectConvolutionLayerOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    // Set bias vector once, before the slice loop, at the argument slot that follows the two 3D tensors
+    if(_bias != nullptr)
+    {
+        unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
+        add_1D_tensor_argument(idx1, _bias, slice_biases);
+    }
+
+    // Run kernel: enqueue once per 3D slice of the execution window
+    do
+    {
+        // Set arguments. NOTE(review): _output is bound unconditionally although configure() accepts a null output - confirm
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice));
+}

diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
index 1d4d776..616e41b 100644
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -138,7 +138,7 @@
 
     // Set static kernel arguments
     unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input and output parameters
-    _kernel.setArg<unsigned int>(idx++, corners->max_num_values());
+    _kernel.setArg<unsigned int>(idx++, _corners->max_num_values());
     _kernel.setArg<cl_uint>(idx++, offset);
     _kernel.setArg(idx++, *_num_buffer);
     _kernel.setArg(idx++, _corners->cl_buffer());

diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 6886f54..241dd85 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,17 +40,16 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
                                                          DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
@@ -58,11 +57,11 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int mult_interleave4x4_height)
 {
-    unsigned int           num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type());
+    constexpr unsigned int num_elems_processed_per_iteration_x = 4;
     constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y * mult_interleave4x4_height;
     bool                   window_changed                      = false;
 
     // Configure kernel window
@@ -73,7 +72,10 @@
     // Configure window in case of configured output
     if(output->total_size() != 0)
     {
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+        const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
+        const float scale_y = 1.0f / (scale_x);
+
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
         window_changed = window_changed || update_window_and_padding(win, output_access);
         output_access.set_valid_region(win, input->valid_region());
     }
@@ -88,25 +90,42 @@
 {
 }
 
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info(), mult_interleave4x4_height)));
 
     // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_interleave4x4_height));
 
     _input  = input;
     _output = output;
 
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+    switch(input->info()->element_size())
+    {
+        case 1:
+            build_opts.add_option("-DDATA_TYPE=uchar");
+            break;
+        case 2:
+            build_opts.add_option("-DDATA_TYPE=ushort");
+            break;
+        case 4:
+            build_opts.add_option("-DDATA_TYPE=uint");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+    }
+
     // Create kernel
-    std::string kernel_name = "gemm_interleave4x4_" + support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts.options()));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
+    auto win_config = validate_and_configure_window(input->info(), output->info(), mult_interleave4x4_height);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -119,10 +138,10 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
-Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_interleave4x4_height));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mult_interleave4x4_height).first);
 
     return Status{};
 }
@@ -144,10 +163,6 @@
     Window in_slice  = window.first_slice_window_2D();
     Window out_slice = window.first_slice_window_2D();
 
-    // Change x and y steps for the slide of output tensor
-    out_slice.scale(Window::DimX, 4.f);
-    out_slice.scale(Window::DimY, 0.25f);
-
     do
     {
         unsigned int idx = 0;

diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 423592b..ae498ec 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,8 @@
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
@@ -33,6 +35,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
 #include <cstddef>
@@ -40,6 +43,7 @@
 #include <tuple>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace arm_compute
 {
@@ -50,14 +54,53 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
     if(!is_interleaved_transposed)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+            ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+        }
+    }
+    else
+    {
+        const int m                         = reshape_info.m();
+        const int n                         = reshape_info.n();
+        const int k                         = reshape_info.k();
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        TensorShape tensor_shape0{ input0->tensor_shape() };
+        tensor_shape0.set(0, k);
+        tensor_shape0.set(1, m);
+
+        TensorShape tensor_shape1{ input1->tensor_shape() };
+        tensor_shape1.set(0, n);
+        tensor_shape1.set(1, k);
+
+        const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+        const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+        }
     }
 
     return Status{};
@@ -75,16 +118,14 @@
     // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
     if(is_interleaved_transposed)
     {
-        // Configure window
-        num_elems_processed_per_iteration_x                        = 16;
-        num_elems_processed_per_iteration_y                        = 4;
-        constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
-        constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
+        // Configure kernel window
+        num_elems_processed_per_iteration_x = 4;
+        num_elems_processed_per_iteration_y = 4;
 
         win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration_input0, 1);
-        AccessWindowRectangle input1_access(input1, 0, 0, num_elems_read_per_iteration_input1, 1);
+        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
         AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
         window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
@@ -94,8 +135,8 @@
     else
     {
         // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
-        num_elems_processed_per_iteration_x = 16;
-        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+        num_elems_processed_per_iteration_x = 4;
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 5);
 
         // Configure window
         win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
@@ -121,10 +162,18 @@
 {
 }
 
-void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
+
+    // Output tensor auto initialization if not yet initialized
+    TensorShape tensor_shape{ input0->info()->tensor_shape() };
+    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
+    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
+
+    auto_init_if_empty(*output->info(), tensor_shape, 1, DataType::S32, 1, QuantizationInfo());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
     _input0 = input0;
     _input1 = input1;
@@ -132,6 +181,9 @@
 
     ElementsProcessed num_elements_processed{};
 
+    // Get target architecture
+    GPUTarget arch_target = get_arch_from_target(get_target());
+
     // Configure kernel window
     auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
@@ -142,15 +194,25 @@
     std::string    kernel_name(" ");
     if(is_interleaved_transposed)
     {
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        // Note: The computation tile has the x dimension equal to 4, which is less than the transpose_width (16).
+        //        In order to correctly access the elements of the transposed matrix B, we need to pass
+        //        the correct step, which is calculated as ((16 * mult_transpose1xW_width) / 4)
+
         build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
-        kernel_name = "gemmlowp_mm_interleaved_transposed";
+        build_opts.add_option("-DTRANSPOSE1XW_WIDTH_STEP=" + support::cpp11::to_string(4 * mult_transpose1xW_width));
+        build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+
+        kernel_name = "gemmlowp_mm_interleaved_transposed_" + string_from_target(arch_target);
     }
     else
     {
         build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
         build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
         build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
-        kernel_name = "gemmlowp_mm";
+        kernel_name = "gemmlowp_mm_" + string_from_target(arch_target);
     }
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -167,10 +229,10 @@
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
-Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
     ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                               input1->clone().get(),
                                                               output->clone().get(),

diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index d05939f..221a156 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,7 +91,7 @@
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
                                                         int32_t a_offset, int32_t b_offset)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
     bool                   window_changed                    = false;
 
     // Configure kernel window
@@ -160,6 +160,14 @@
                                                     a_offset, b_offset); // NOLINT
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "gemmlowp_offset_contribution_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
 }
 
 Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,

diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 1499df0..3fe956d 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,6 +36,42 @@
 
 using namespace arm_compute;
 
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->data_type());
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const float beta)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != output->dimension(1));
+
+    ARM_COMPUTE_UNUSED(beta);
+    return Status{};
+}
+} // namespace
+
 CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel()
     : _input(nullptr), _output(nullptr)
 {
@@ -43,14 +79,13 @@
 
 void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    _input                                               = input;
-    _output                                              = output;
-    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), beta));
+
+    _input  = input;
+    _output = output;
 
     std::ostringstream ma_arguments;
     if(is_data_type_fixed_point(input->info()->data_type()))
@@ -74,16 +109,15 @@
     _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
 
     // Configure kernel window
-    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure(win);
+Status CLGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const float beta)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, beta));
+    return Status{};
 }
 
 void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue)

diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 19f38bf..6c31e37 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,24 +36,69 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <set>
 #include <string>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
 using ElementsProcessed = Steps;
 
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+
     if(!is_interleaved_transposed)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+            ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+        }
+    }
+    else
+    {
+        const int m                         = reshape_info.m();
+        const int n                         = reshape_info.n();
+        const int k                         = reshape_info.k();
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        TensorShape tensor_shape0{ input0->tensor_shape() };
+        tensor_shape0.set(0, k);
+        tensor_shape0.set(1, m);
+
+        TensorShape tensor_shape1{ input1->tensor_shape() };
+        tensor_shape1.set(0, n);
+        tensor_shape1.set(1, k);
+
+        const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+        const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+        }
     }
 
     return Status{};
@@ -122,12 +167,19 @@
 {
 }
 
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
+    // Output tensor auto initialization if not yet initialized
+    TensorShape tensor_shape{ input0->info()->tensor_shape() };
+    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
+    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
+
+    auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
+
     // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
     _input0 = input0;
     _input1 = input1;
@@ -176,7 +228,13 @@
     std::string kernel_name;
     if(is_interleaved_transposed)
     {
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
         build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+        build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
+        build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+
         if(data_type == DataType::F32)
         {
             kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
@@ -230,11 +288,13 @@
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
+                                            const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
 {
+    // Note: num_elements_processed will be set in validate_and_configure_window()
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                               input1->clone().get(),
                                                               output->clone().get(),

diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index 951bc14..cc483dc 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,23 +45,35 @@
 
 void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
     ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
 
     _input0 = input0;
     _input1 = input1;
     _output = output;
 
+    // Check if it is a quantized operation
+    bool is_quantized = is_data_type_quantized_asymmetric(_input0->info()->data_type());
+
     // Create kernel
-    std::set<std::string> build_opts;
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(!is_quantized, "-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input0->info()->dimension(0)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input0->info()->dimension(1)));
 
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
-    build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input0->info()->dimension(0)));
-    build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input0->info()->dimension(1)));
+    std::string kernel_name = is_quantized ? std::string("gemm_mv_quantized") : std::string("gemm_mv");
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mv", build_opts));
+    // Add static arguments
+    if(is_quantized)
+    {
+        unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor() + num_arguments_per_1D_tensor();
+        _kernel.setArg<int>(idx++, -_input0->info()->quantization_info().offset);
+        _kernel.setArg<int>(idx++, -_input1->info()->quantization_info().offset);
+    }
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.

diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 69a545b..24d2187 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,8 +42,9 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
                                                          DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
@@ -51,7 +52,7 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
-                                                           compute_transpose1xW_with_element_size_shape(*input));
+                                                           compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
@@ -59,11 +60,11 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width)
 {
     num_elems_processed_per_iteration = 16 / input->element_size();
 
-    const int scale_x        = num_elems_processed_per_iteration;
+    const int scale_x        = num_elems_processed_per_iteration * mult_transpose1xW_width;
     bool      window_changed = false;
 
     // Configure kernel window
@@ -90,25 +91,32 @@
 }
 } // namespace
 
-void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info(), mult_transpose1xW_width)));
 
     // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width));
 
     _input  = input;
     _output = output;
 
     // Configure kernel window
+    // Note: num_elems_processed_per_iteration will be set in validate_and_configure_window()
     unsigned int num_elems_processed_per_iteration = 1;
-    auto         win_config                        = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration);
+    auto         win_config                        = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
+    build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
+
     /*
      * Following an example of how the transposition1xW works when the input data type is F32
      *
@@ -117,18 +125,18 @@
      *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
      *         |a30 a31 a32 a33|
      *
-     * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+     * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
      */
     // Create kernel
-    std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+    std::string kernel_name = "gemm_transpose1xW";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 }
 
-Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
 {
     unsigned int num_elems_processed_per_iteration = 1;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first);
 
     return Status{};
 }

diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 0e9f2c5..b75d264 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,9 +41,10 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     // Checks performed when output is configured
@@ -58,7 +59,7 @@
 } // namespace
 
 CLIm2ColKernel::CLIm2ColKernel()
-    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr), _kernel_dims()
 {
 }
 
@@ -67,10 +68,11 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias));
 
-    _input  = input;
-    _output = output;
+    _input       = input;
+    _output      = output;
+    _kernel_dims = kernel_dims;
 
     const DataType  data_type  = input->info()->data_type();
     const GPUTarget gpu_target = get_arch_from_target(get_target());
@@ -78,6 +80,7 @@
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
     build_opts.add_option_if(has_bias, "-DHAS_BIAS");
     build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
 
@@ -92,13 +95,19 @@
                                                     output->info()->tensor_shape().cbegin() + 1))
                                      && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
 
-    std::string kernel_name = "im2col_generic";
+    bool is_optimized_path = false;
+
+    _num_elems_processed_per_iteration = 1;
+
+    std::string kernel_name;
     if(!run_img2col_reduced)
     {
+        // Default kernel name
+        kernel_name = "im2col_generic_dchw";
+
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                             kernel_dims.width, kernel_dims.height,
                                             conv_info);
-        _num_elems_processed_per_iteration = output->info()->dimension(0);
 
         build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
         build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
@@ -115,19 +124,53 @@
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
         build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
 
-        if(kernel_dims.width == 3 && kernel_dims.height == 3 && !conv_info.has_padding())
-        {
-            kernel_name = "im2col_kernel3x3_padx0_pady0";
+        const bool squared_im2col = kernel_dims.width == kernel_dims.height;
 
-            // Local work size optimized for the 3x3 MobileNets convolution on Bifrost.
-            if(gpu_target == GPUTarget::BIFROST && input->info()->dimension(0) == 224)
+        if(squared_im2col && !is_data_type_fixed_point(data_type))
+        {
+            // Check if we can run an optimized im2col
+            switch(kernel_dims.width)
             {
-                _lws_hint = cl::NDRange(2, 3, 3);
+                case 1:
+                    // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+                    if(conv_info.stride().first == 1 && !conv_info.has_padding())
+                    {
+                        // Set hint for LWS
+                        _lws_hint                          = cl::NDRange(1, 1, 8);
+                        _num_elems_processed_per_iteration = 4;
+                        is_optimized_path                  = true;
+                        kernel_name                        = "im2col1x1_stridex1_dchw";
+                    }
+                    break;
+                case 3:
+                    _lws_hint                          = cl::NDRange(1, 1, 8);
+                    _num_elems_processed_per_iteration = 1;
+                    is_optimized_path                  = true;
+                    kernel_name                        = "im2col3x3_dchw";
+                    break;
+                case 5:
+                    _num_elems_processed_per_iteration = 1;
+                    is_optimized_path                  = true;
+                    kernel_name                        = "im2col5x5_dchw";
+                    break;
+                case 11:
+                    // Optimized im2col11x11 if pad_x = pad_y = 0
+                    if(!conv_info.has_padding())
+                    {
+                        _num_elems_processed_per_iteration = 1;
+                        is_optimized_path                  = true;
+                        kernel_name                        = "im2col11x11_padx0_pady0_dchw";
+                    }
+                    break;
+                default:
+                    is_optimized_path = false;
+                    break;
             }
         }
         else if(kernel_dims.width > 1 && !conv_info.has_padding())
         {
-            kernel_name = "im2col_generic_padx0_pady0";
+            _num_elems_processed_per_iteration = 1;
+            kernel_name                        = "im2col_generic_padx0_pady0_dchw";
 
             // Optimized im2col is performed using one or more vector operations with the specified vector size
             // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
@@ -151,30 +194,12 @@
             build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
             build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
         }
-        else
-        {
-            if(gpu_target == GPUTarget::BIFROST)
-            {
-                const size_t input_channels = input->info()->dimension(2);
-                if((input_channels & (input_channels - 1)) == 0)
-                {
-                    // input_channels is a power of two
-                    _lws_hint = cl::NDRange(1, 1, 4);
-                }
-                else if(input_channels < 192 && (input_channels % 4) == 0)
-                {
-                    // input_channels is less than 192 and is a multiple of 4
-                    _lws_hint = cl::NDRange(1, 1, 2);
-                }
-                // otherwise the default is optimal
-            }
-        }
         _run_func = &CLIm2ColKernel::run_generic;
     }
     else
     {
-        kernel_name                        = "im2col_reduced";
         _num_elems_processed_per_iteration = 1;
+        kernel_name                        = "im2col_reduced_dchw";
         _run_func                          = &CLIm2ColKernel::run_reduced;
     }
 
@@ -182,8 +207,30 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure  kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-    // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    Window win;
+    if(is_optimized_path)
+    {
+        win = calculate_max_window(*input->info(),
+                                   Steps(_num_elems_processed_per_iteration),
+                                   false,
+                                   BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()));
+
+        const int x = -conv_info.pad_left();
+        const int y = -conv_info.pad_top();
+        const int w = kernel_dims.width * _num_elems_processed_per_iteration;
+        const int h = kernel_dims.height;
+
+        AccessWindowRectangle input_access(input->info(), x, y, w, h);
+
+        update_window_and_padding(win, input_access);
+    }
+    else
+    {
+        // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
+        // update_window_and_padding() can be skipped
+        win = calculate_max_window(*input->info(), Steps());
+    }
+
     output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
     if(!run_img2col_reduced)
     {
@@ -194,8 +241,8 @@
     ICLKernel::configure(win);
 
     // Set config_id for enabling LWS tuning
-    _config_id = "im2col_";
-    _config_id += (run_img2col_reduced ? "reduced_" : "");
+    _config_id = kernel_name;
+    _config_id += "_";
     _config_id += lower_string(string_from_data_type(input->info()->data_type()));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
@@ -208,7 +255,7 @@
     ARM_COMPUTE_UNUSED(kernel_dims);
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(has_bias);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias));
     return Status{};
 }
 
@@ -232,9 +279,15 @@
     Window slice_in  = window_collapsed.first_slice_window_3D();
     Window slice_out = window_collapsed.first_slice_window_3D();
 
-    // Setup slice
-    slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
-    slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+    // Setup slice if stride_x != 1 or stride_y != 1
+    if(_convolved_dims.first != _input->info()->dimension(0) || _convolved_dims.second != _input->info()->dimension(1))
+    {
+        // If the stride_x or stride_y are not 1, the output tensor of matrix multiply (Convolved tensor) will not
+        // have the same shape of the im2col input tensor
+        // In this case we need to re-compute the window using the shape of the tensor after matrix multiply (convolved_dims)
+        slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+        slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+    }
 
     // Setup input slice
     // The first three dimensions of the input are increased by the inner loops
@@ -243,7 +296,7 @@
     slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Setup output slice
-    slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+    slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _kernel_dims.area()));
     slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
     slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
 

diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index 9b4533b..8ba1f77 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,7 +89,6 @@
     Window window_output;
     window_output.use_tensor_dimensions(_output->info()->tensor_shape());
     window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_output.collapse_if_possible(ICLKernel::window(), 1);
 
     Iterator output(_output, window_output);
 
@@ -110,27 +109,21 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    // Collapse min/max batches
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
     Window slice            = window_collapsed.first_slice_window_3D();
     slice.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice.set(Window::DimY, Window::Dimension(0, 1, 1));
     slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
 
-    Window window_output;
-    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
-    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_output.collapse_if_possible(ICLKernel::window(), 1);
-
-    Window output_slice = window_output.first_slice_window_1D();
-
     do
     {
+        Window output_slice = slice.shift_dimensions(2);
+
         unsigned int idx = 0;
         // Set inputs
         add_3D_tensor_argument(idx, _input, slice);
         add_1D_tensor_argument(idx, _output, output_slice);
         enqueue(queue, *this, slice);
     }
-    while(window.slide_window_slice_3D(slice) && window_output.slide_window_slice_1D(output_slice));
+    while(window_collapsed.slide_window_slice_3D(slice));
 }

diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index 132de60..12c2d58 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp

@@ -30,6 +30,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -46,20 +47,35 @@
     permute(output_shape, perm);
     return output_shape;
 }
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    // Input: single channel, one of the data types listed below, and at least three dimensions
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16,
+                                                         DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->num_dimensions() < 3), "Invalid input size!");
+    // Permutation: a 3D vector must be [2, 0, 1] or [1, 2, 0], a 4D vector must be [3, 2, 0, 1]; vectors of any other rank pass this check
+    const auto perm_rank      = perm.num_dimensions();
+    const bool unsupported_3d = (perm_rank == 3) && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0);
+    const bool unsupported_4d = (perm_rank == 4) && !(perm[0] == 3 && perm[1] == 2 && perm[2] == 0 && perm[3] == 1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(unsupported_3d || unsupported_4d, "Only [2, 0, 1],[1, 2, 0] and [3, 2, 0, 1] permutation is supported");
+
+    const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+    // An output whose total size is zero is skipped; otherwise its shape, data type and fixed point checks are run against the input
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+    return Status{};
+}
 } // namespace
 
 void CLPermuteKernel::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
-                                                  DataType::U16, DataType::S16, DataType::QS16,
-                                                  DataType::U32, DataType::S32,
-                                                  DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->num_dimensions() < 3, "Invalid input size!");
-    ARM_COMPUTE_ERROR_ON_MSG(
-        (perm.num_dimensions() != 3 && ((perm[0] != 2 && perm[1] != 0 && perm[2] != 1) || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))) && (perm.num_dimensions() != 4 && ((perm[0] != 2 && perm[1] != 0
-                && perm[2] != 1)
-                || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))),
-        "Only [2, 0, 1],[1, 2, 0] and [3, 2, 0, 1] permutation is supported");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
 
     _input  = input;
     _output = output;
@@ -101,15 +117,23 @@
     ICLKernel::configure(win);
 }
 
+// Reports whether (input, output, perm) is a configuration this kernel accepts: the first failing check's Status, else an OK Status.
+Status CLPermuteKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm)); // forwards a Status; RETURN_ERROR_ON would test it as a boolean condition
+    return Status{};
+}
+
 void CLPermuteKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    Window slice_in = window.first_slice_window_4D();
-    Window slice_out(slice_in);
+    Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
 
     // Setup output slice
+    Window slice_out(slice_in);
     slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
@@ -117,12 +141,10 @@
 
     do
     {
-        auto         collapsed_slice_in  = slice_in.collapse(ICLKernel::window(), 2);
-        auto         collapsed_slice_out = slice_out.collapse(ICLKernel::window(), 2);
-        unsigned int idx                 = 0;
-        add_4D_tensor_argument(idx, _input, collapsed_slice_in);
-        add_4D_tensor_argument(idx, _output, collapsed_slice_out);
-        enqueue(queue, *this, collapsed_slice_in);
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input, slice_in);
+        add_4D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_in);
     }
     while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
 }

diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 6dba9c0..f30ba61 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,8 @@
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
 Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
@@ -50,10 +52,13 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
 
+    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
     if(is_data_type_fixed_point(input1->data_type()))
     {
         // All data types must be all QS8 or all QS16
@@ -62,12 +67,12 @@
     }
 
     // Validate in case of configured output
-    if((output != nullptr) && (output->total_size() != 0))
+    if(output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                         "Output can only be U8 if both inputs are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
         if(is_data_type_fixed_point(input1->data_type()))
         {
@@ -80,18 +85,36 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
 
-    Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(*output, out_shape);
+
+        if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+        {
+            set_format_if_unknown(*output, Format::S16);
+        }
+        else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+        {
+            set_format_if_unknown(*output, Format::F32);
+        }
+    }
+
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
 
     AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
-    bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
 
-    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
-                                                       input2->valid_region());
     output_access.set_valid_region(win, valid_region);
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -108,24 +131,13 @@
                                                 ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output->info(), Format::S16);
-        }
-        else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output->info(), Format::F32);
-        }
-    }
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
                                                   scale, overflow_policy, rounding_policy));
 
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
     _input1 = input1;
     _input2 = input2;
     _output = output;
@@ -207,15 +219,13 @@
         _kernel.setArg(idx++, scale);
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 }
 
 Status CLPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
                                                  ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
@@ -227,16 +237,47 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
+    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+    const TensorShape &out_shape = _output->info()->tensor_shape();
+
+    bool can_collapse = true;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input1, slice);
-        add_3D_tensor_argument(idx, _input2, slice);
+        add_3D_tensor_argument(idx, _input1, slice_input1);
+        add_3D_tensor_argument(idx, _input2, slice_input2);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice);
+
+        collapsed.slide_window_slice_3D(slice_input1);
+        collapsed.slide_window_slice_3D(slice_input2);
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+
+BorderSize CLPixelWiseMultiplicationKernel::border_size() const
+{
+    // Right-hand border only: output width minus the narrower input's width, capped at one processing step minus one element
+    const unsigned int replicated_x = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    return BorderSize(0, std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicated_x), 0, 0);
+}

diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 860cc92..b3034e1 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp

@@ -63,12 +63,8 @@
                                     "Unsupported combination of parameters!");
 
     const bool         is_global_pooling = pool_info.is_global_pooling();
-    const unsigned int pool_size         = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
-                                    "Global pooling is supported only with rectangular inputs!");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
-                                    "Invalid pool size and pool pad combination!");
+    const unsigned int pool_size_x       = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
+    const unsigned int pool_size_y       = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -80,8 +76,8 @@
         unsigned int pooled_h = 0;
         std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
                                                          input->dimension(1),
-                                                         pool_size,
-                                                         pool_size,
+                                                         pool_size_x,
+                                                         pool_size_y,
                                                          pool_info.pad_stride_info());
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h),
                                         "Invalid output pooling dimensions!");
@@ -92,32 +88,31 @@
 
 std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
-    int                 pool_pad_x      = 0;
-    int                 pool_pad_y      = 0;
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     unsigned int        pooled_w        = 0;
     unsigned int        pooled_h        = 0;
-    int                 pool_size       = pool_info.pool_size();
+    int                 pool_size_x     = pool_info.is_global_pooling() ? input->dimension(0) : pool_info.pool_size().width;
+    int                 pool_size_y     = pool_info.is_global_pooling() ? input->dimension(1) : pool_info.pool_size().height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
-    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    const int pool_pad_right  = pad_stride_info.pad_right();
+    const int pool_pad_top    = pad_stride_info.pad_top();
+    const int pool_pad_left   = pad_stride_info.pad_left();
+    const int pool_pad_bottom = pad_stride_info.pad_bottom();
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Update pool size in case of global pooling
-    pool_size = pool_info.is_global_pooling() ? input->dimension(0) : pool_size;
-
     // Check output dimensions
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
                                                      input->dimension(1),
-                                                     pool_size,
-                                                     pool_size,
+                                                     pool_size_x,
+                                                     pool_size_y,
                                                      pad_stride_info);
 
     auto_init(input, output, pooled_w, pooled_h);
 
-    BorderSize     border_size = BorderSize(pool_pad_y, pool_pad_x);
+    BorderSize     border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
     const DataType data_type   = input->data_type();
 
     const int input_width  = input->dimension(0);
@@ -125,23 +120,23 @@
 
     // Change the number of elements processed per iteration
     // for pooling 3x3 with stride less equal than 3
-    const bool         can_optimize                      = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
+    const bool         can_optimize                      = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
     const unsigned int num_elems_processed_per_iteration = can_optimize ? 4 : 1;
-    const int          num_elems_read_per_iteration      = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size;
+    const int          num_elems_read_per_iteration      = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
 
     // Number of iterations in X dimension
     const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
 
     // Upper limit for the number of right/bottom border elements that are accessed
-    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
-    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
+    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
 
-    border_size.right  = std::max(upper_bound_w, pool_pad_x);
-    border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+    border_size.right  = std::max(upper_bound_w, pool_pad_right);
+    border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
 
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, num_elems_read_per_iteration, pool_size,
+    AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
                                        pool_stride_x * num_elems_processed_per_iteration, pool_stride_y);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, output_access);
@@ -164,29 +159,26 @@
 
 void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
-    int                 pool_pad_x      = 0;
-    int                 pool_pad_y      = 0;
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     unsigned int        pooled_w        = 0;
     unsigned int        pooled_h        = 0;
     const PoolingType   pool_type       = pool_info.pool_type();
-    int                 pool_size       = pool_info.pool_size();
+    const int           pool_size_x     = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_info.pool_size().width;
+    const int           pool_size_y     = pool_info.is_global_pooling() ? input->info()->dimension(1) : pool_info.pool_size().height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
     const bool          exclude_padding = pool_info.exclude_padding();
-    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    const int pool_pad_top  = pad_stride_info.pad_top();
+    const int pool_pad_left = pad_stride_info.pad_left();
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Update pool size in case of global pooling
-    pool_size = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_size;
-
     // Check output dimensions
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
                                                      input->info()->dimension(1),
-                                                     pool_size,
-                                                     pool_size,
+                                                     pool_size_x,
+                                                     pool_size_y,
                                                      pad_stride_info);
 
     auto_init(input->info(), output->info(), pooled_w, pooled_h);
@@ -211,30 +203,31 @@
     if(pool_type != PoolingType::MAX)
     {
         build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
-        build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x)));
-        build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y)));
+        build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_left)));
+        build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_top)));
         build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
-        build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_x));
-        build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y));
+        build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+        build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
     }
 
     // Create kernel
-    if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
+    if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
     {
         // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
         // each thread computes 4 output elements
-        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+        const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
 
         std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
-                                  + support::cpp11::to_string(pool_size);
+                                  + support::cpp11::to_string(pool_size_x);
         _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
     else // Run general case
     {
-        build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size));
+        build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+        build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
         build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
 
-        std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N";
+        std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized" : "pooling_layer_MxN";
         _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
 
@@ -281,8 +274,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    unsigned int pool_stride_x = 0;
+    unsigned int pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
 
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -292,11 +285,11 @@
     {
         // Upsample input by pool size
         Window in_slice(slice);
-        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x,
-                                                     (in_slice.x().end() - pool_pad_x) * pool_stride_x,
+        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info().pad_left(),
+                                                     (in_slice.x().end() - _pool_info.pad_stride_info().pad_left()) * pool_stride_x,
                                                      pool_stride_x * _num_elems_processed_per_iteration));
-        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y,
-                                                     (in_slice.y().end() - pool_pad_y) * pool_stride_y,
+        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info().pad_top(),
+                                                     (in_slice.y().end() - _pool_info.pad_stride_info().pad_top()) * pool_stride_y,
                                                      pool_stride_y));
 
         // Set inputs

diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 4756443..8b082a8 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -82,20 +82,16 @@
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
     Window slice            = window_collapsed.first_slice_window_3D();
 
-    Window window_min_max;
-    window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
-    window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_min_max.collapse_if_possible(ICLKernel::window(), 1);
-
-    Window slice_min_max = window_min_max.first_slice_window_1D();
-
     do
     {
+        Window slice_min_max = slice.shift_dimensions(2);
+        slice_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
+
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         add_1D_tensor_argument(idx, _min_max, slice_min_max);
         enqueue(queue, *this, slice);
     }
-    while(window.slide_window_slice_3D(slice) && window_min_max.slide_window_slice_1D(slice_min_max));
+    while(window_collapsed.slide_window_slice_3D(slice));
 }

diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index 4e000c6..a07a424 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,7 +108,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    Window slice(window);
+    Window slice = window.first_slice_window_3D();
     // Parallelize spatially and across the fourth dimension of the output tensor (also across ROIArray)
     slice.set(Window::DimZ, window[3]);
 

diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 18a8e35..1dd5eb9 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -118,8 +118,8 @@
     out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
 
     // Get first input and output slices
-    Window in_slice  = window.first_slice_window_1D();
-    Window out_slice = out_window.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = out_window.first_slice_window_2D();
 
     // Reshape window
     const unsigned int border_width = ((in_slice.x().end() % 128) != 0) ? 128 - in_slice.x().end() % 128 : 0;
@@ -127,14 +127,14 @@
 
     // Set local sums buffer
     unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size();
-    _kernel.setArg(num_arguments_per_1D_tensor() * 2, local_sum_size, nullptr);
+    _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
 
     do
     {
         unsigned int idx = 0;
-        add_1D_tensor_argument(idx, _input, in_slice);
-        add_1D_tensor_argument(idx, _output, out_slice);
+        add_2D_tensor_argument(idx, _input, in_slice);
+        add_2D_tensor_argument(idx, _output, out_slice);
         enqueue(queue, *this, in_slice, _lws_hint);
     }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
 }

diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 04a7639..447d6ee 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,33 +79,14 @@
     return build_opts;
 }
 
-// Arguments Validation
-
-Status validate_arguments_1DMax(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        // Softmax across the x dimension
-        TensorShape output_shape{ input->tensor_shape() };
-        output_shape.set(0, 1);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
 
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
+
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
 
     // Checks performed when output is configured
@@ -141,33 +122,6 @@
     return Status{};
 }
 
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    }
-
-    // Checks performed when sum is configured
-    if(sum->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
-    }
-
-    return Status{};
-}
-
 Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
@@ -200,58 +154,6 @@
 
 // Window validation
 
-std::pair<Status, Window> validate_and_configure_window_1DMax(ITensorInfo *input, ITensorInfo *output)
-{
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(0, 1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-
-    // The kernel loops over all elements in steps of 16
-    const unsigned int     num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
-    constexpr unsigned int num_elems_written_per_iteration   = 1;
-
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_1DShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
-    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
-    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->data_type();
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum, max->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->fixed_point_position()));
-    auto_init_if_empty(*output, input->clone()->set_data_type(tmp_data_type));
-
-    // The kernel loops over all elements in steps of 16
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal max_access(max, 0, 1);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum, 0, 1);
-
-    bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
-    output_access.set_valid_region(win, input->valid_region());
-    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
 std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
 {
     // Output auto initialization if not yet initialized
@@ -305,146 +207,6 @@
 
 } // namespace
 
-void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(0, 1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMax(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
-
-    const DataType data_type = input->info()->data_type();
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_opts.add_option_if(is_data_type_fixed_point(data_type),
-                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
-    // Tell the kernel that the width is not a multiple of 16
-    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");
-
-    // Create kernel
-    std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set fixed arguments
-    unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_1DMax(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "softmax_layer_";
-    _config_id += lower_string(string_from_data_type(data_type));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-}
-
-Status CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMax(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMax(input->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
-    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
-    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
-    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum->info(), max->info()->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->info()->fixed_point_position()));
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(tmp_data_type));
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
-
-    _input  = input;
-    _max    = max;
-    _output = output;
-    _sum    = sum;
-
-    const DataType dt       = input->info()->data_type();
-    auto           beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
-
-    // Set build options
-    CLBuildOptions build_opts;
-    build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
-    build_opts.add_option_if(is_data_type_fixed_point(dt),
-                             std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
-    // Tell the kernel that the width is not a multiple of 16
-    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
-    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
-    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
-    build_opts.add_options_if(is_quantized_asymmetric,
-                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
-
-    // Create kernel
-    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set fixed arguments
-    unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
-
-    // Configure window
-    auto win_config = validate_and_configure_window_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure(win_config.second);
-}
-
-Status CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DShiftExpSum(input, max, output, sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
-
-    return Status{};
-}
-
-void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        // Set inputs
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _max, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        add_3D_tensor_argument(idx, _sum, slice);
-        enqueue(queue, *this, slice, _lws_hint);
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
-}
-
 /**< Grid size (obtained through auto-tuning) */
 const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
 /**< Vector size in the serial case (obtained through auto-tuning) */
@@ -485,9 +247,11 @@
     build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
     build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
     build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
 
-    _lws_hint                                     = cl::NullRange;
-    std::string           kernel_name             = std::string("softmax_layer_max_shift_exp_sum_serial");
+    _lws_hint               = cl::NullRange;
+    std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
+                              std::string("softmax_layer_max_shift_exp_sum_serial");
     ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
     unsigned int          vector_size             = std::get<1>(parallel_reduction_info);
 
@@ -498,7 +262,7 @@
     // Configure parallel kernel if needed
     if(std::get<0>(parallel_reduction_info))
     {
-        kernel_name            = std::string("softmax_layer_max_shift_exp_sum_parallel");
+        kernel_name            = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_parallel") : std::string("softmax_layer_max_shift_exp_sum_parallel");
         bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
         build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
 

diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 3a9a32e..f5eaa5a 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,8 +31,41 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) // Argument checks for the weights-reshape kernel; biases may be nullptr
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // biases is not checked here; a null biases pointer is handled below
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+
+    if(biases != nullptr) // Bias checks apply only when a bias tensor is supplied
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type())); // Biases combined with asymmetric-quantized input are rejected
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); // 4D input requires 1D biases
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); // 5D input requires 2D biases
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); // Bias length must equal input dimension 3
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); // Bias extents must equal input dimensions 3 and 4
+    }
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr)); // Expected shape is computed from input and from whether biases are present
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{}; // Reached only when none of the checks above returned early; presumably the "no error" value - TODO confirm against Status
+}
+} // namespace
 
 CLWeightsReshapeKernel::CLWeightsReshapeKernel()
     : _input(nullptr), _biases(nullptr), _output(nullptr)
@@ -41,35 +74,17 @@
 
 void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    const DataType data_type = input->info()->data_type();
-
-    // Calculate output shape
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.collapse(3);
-    const size_t tmp_dim = output_shape[0];
-    output_shape.set(0, output_shape[1]);
-    output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr))));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (biases != nullptr) ? biases->info() : nullptr,
+                                                  output->info()));
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(data_type));
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
-        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
-        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
-        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
-        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
-    }
+    const DataType data_type = input->info()->data_type();
 
     _biases = biases;
     _output = output;
@@ -99,6 +114,12 @@
     ICLKernel::configure(win);
 }
 
+Status CLWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output)); // Same argument checks that configure() runs; biases is forwarded as given and may be nullptr
+    return Status{}; // No further validation is performed in this function
+}
+
 void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 298c700..5c93f3e 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp

@@ -44,12 +44,7 @@
                                                          DataType::U16, DataType::S16, DataType::QS16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 3, "Invalid input size!");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        (perm.num_dimensions() != 3 && ((perm[0] != 2 && perm[1] != 0 && perm[2] != 1) || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))) && (perm.num_dimensions() != 4 && ((perm[0] != 2 && perm[1] != 0
-                && perm[2] != 1)
-                || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))),
-        "Only [2, 0, 1],[1, 2, 0] and [3, 2, 0, 1] permutation is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() > 4, "Only up to 4D permutation vectors are supported");
 
     const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
 
@@ -70,7 +65,8 @@
     const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
     for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
     {
-        dimensions[perm[i]] = old_dim[i];
+        T dimension_val = old_dim[i];
+        dimensions.set(perm[i], dimension_val);
     }
 }
 
@@ -79,20 +75,23 @@
 template <typename T>
 void CPPPermuteKernel::run_permute(const Window &window)
 {
+    // Permute strides
     Strides strides      = _output->info()->strides_in_bytes();
     Strides perm_strides = strides;
     permute_strides(perm_strides, _perm);
-    const int               output_stride_w = strides[3];
+
+    // Create output window
     Window                  window_out(window);
     const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
     for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
     {
         window_out.set(d, zero_window);
     }
+
     // Create iterators
     Iterator in(_input, window);
     Iterator out(_output, window_out);
-    ARM_COMPUTE_ERROR_ON(_perm.num_dimensions() > _input->info()->num_dimensions());
+
     if(_input->info()->num_dimensions() <= 3)
     {
         execute_window_loop(window, [&](const Coordinates & id)
@@ -104,26 +103,12 @@
     }
     else if(_input->info()->num_dimensions() >= 4)
     {
-        if(_perm.num_dimensions() < _input->info()->num_dimensions())
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            // special case: perm.size = 3 and tensor size > 3, _perm[3] would be invalid so we handle this with id[3] * output_stride_w instead of id[_perm[3]]
-            ARM_COMPUTE_ERROR_ON(_perm.num_dimensions() < 3);
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * output_stride_w;
-                *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
-            },
-            in, out);
-        }
-        else
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
-                *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
-            },
-            in, out);
-        }
+            const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
+            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+        },
+        in, out);
     }
 }
 

diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index b593c27..2f6a94b 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,7 +56,7 @@
     va_end(args);
     throw std::runtime_error(err.error_description());
 }
-void Status::internal_throw_on_error()
+void Status::internal_throw_on_error() const
 {
     throw std::runtime_error(_error_description);
 }

diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index 0b9cd3f..d4ce388 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp

@@ -190,6 +190,7 @@
 const std::map<std::string, std::string> GCKernelLibrary::_shader_program_map =
 {
     { "absdiff", "absdiff.cs" },
+    { "tensorshift", "tensor_shift.cs" },
     { "direct_convolution1x1", "direct_convolution1x1.cs" },
     { "direct_convolution3x3", "direct_convolution3x3.cs" },
     { "direct_convolution5x5", "direct_convolution5x5.cs" },
@@ -235,6 +236,10 @@
 #include "./cs_shaders/absdiff.csembed"
     },
     {
+        "tensor_shift.cs",
+#include "./cs_shaders/tensor_shift.csembed"
+    },
+    {
         "convolution_layer.cs",
 #include "./cs_shaders/convolution_layer.csembed"
     },

diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
index 5576665..19af777 100644
--- a/src/core/GLES_COMPUTE/IGCTensor.cpp
+++ b/src/core/GLES_COMPUTE/IGCTensor.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,7 +26,7 @@
 using namespace arm_compute;
 
 IGCTensor::IGCTensor()
-    : _mapping(nullptr)
+    : _mapping(nullptr), _needs_shifting(false)
 {
 }
 
@@ -52,3 +52,13 @@
 {
     return _mapping;
 }
+
+bool IGCTensor::needs_shifting() const
+{
+    return _needs_shifting; // current value of the flag; the IGCTensor constructor initialises it to false
+}
+
+void IGCTensor::set_needs_shifting(bool needs_shifting)
+{
+    _needs_shifting = needs_shifting; // plain store of the flag; no other state is touched here
+}

diff --git a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
index 0ff4360..2ab6d5e 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,20 +29,20 @@
 precision mediump float;
 #define ADD(x, y) (x) + (y)
 
-/** This function add two images.
+/** This function adds two tensors.
  *
- * @param[in]  src1_ptr   Pointer to the first source image. Supported data types: F16
- * @param[in]  src1_attrs The attributes of the first source image
- * @param[in]  src2_ptr   Pointer to the second source image. Supported data types: Same as @p src1_ptr
- * @param[in]  src2_attrs The attributes of the second source image
- * @param[out] dst_ptr    Pointer to the destination image. Supported data types: Same as @p src1_ptr
- * @param[in]  dst_attrs  The attributes of the destination image
+ * @param[in]  src1_ptr   Pointer to the first source tensor. Supported data types: F16
+ * @param[in]  src1_attrs The attributes of the first source tensor
+ * @param[in]  src2_ptr   Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_attrs The attributes of the second source tensor
+ * @param[out] dst_ptr    Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_attrs  The attributes of the destination tensor
  */
 SHADER_PARAMS_DECLARATION
 {
-    ImageAttributes src1_attrs;
-    ImageAttributes src2_attrs;
-    ImageAttributes dst_attrs;
+    Tensor3DAttributes src1_attrs;
+    Tensor3DAttributes src2_attrs;
+    Tensor3DAttributes dst_attrs;
 };
 
 TENSOR_DECLARATION(1, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
@@ -51,9 +51,9 @@
 
 void main(void)
 {
-    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR(src1_attrs, src1_shift);
-    ImageIterator src2_iter = CONVERT_TO_IMAGE_ITERATOR(src2_attrs, src2_shift);
-    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
+    Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
+    Tensor3DIterator dst_iter  = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     vec4 tmp1[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
     vec4 tmp2[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src2_ptr, src2_iter);
@@ -62,4 +62,4 @@
     addition[1] = ADD(tmp1[1], tmp2[1]);
 
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, addition);
-}
\ No newline at end of file
+}

diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index 53fb515..7629b25 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,6 +36,16 @@
 #define INVSQRT_OP(a) inversesqrt((a))
 #define SQCVT_SAT(a) (a)
 
+#if defined(LU_BRELU)
+#define ACTIVATION_FUNC(x) min(max(x, float(B_VAL)), float(A_VAL))
+#elif defined(BRELU)
+#define ACTIVATION_FUNC(x) min(max(x, float(0)), float(A_VAL))
+#elif defined(RELU)
+#define ACTIVATION_FUNC(x) max(x, float(0))
+#else /* !(LU_BRELU || BRELU || RELU) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* LU_BRELU || BRELU || RELU */
+
 /** Apply batch normalization.
  *
  * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
@@ -102,7 +112,7 @@
     gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
     beta_param  = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
 
-    STORE_CURRENT_ITEM(dst_ptr, dst_iter, ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param)));
 }
 
 #elif defined(DATA_TYPE_FP16)
@@ -148,7 +158,7 @@
 
         gamma_param = unpacked_s[3].x;
         beta_param  = unpacked_s[4].x;
-        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
 
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
     }
@@ -163,7 +173,7 @@
 
         gamma_param = unpacked_s[3].y;
         beta_param  = unpacked_s[4].y;
-        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
 
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
     }
@@ -178,7 +188,7 @@
 
         gamma_param = unpacked_s[3].z;
         beta_param  = unpacked_s[4].z;
-        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
 
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
     }
@@ -193,7 +203,7 @@
 
         gamma_param = unpacked_s[3].w;
         beta_param  = unpacked_s[4].w;
-        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
 
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
     }

diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 0c8b5bf..69ac50b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,8 +53,8 @@
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
-    STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
+    float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 
 #elif defined(DATA_TYPE_FP16)
@@ -66,7 +66,7 @@
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
-    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
-    STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
+    uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSET_X, -OFFSET_Y, 0));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
 }
 #endif /*DATA_TYPE_FP16*/

diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 344d480..774173d 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs

@@ -586,8 +586,12 @@
         // even row
         if((pos.y + pos.z * height) % uint(2) == uint(0))
         {
-            tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
-            STORE(dst_ptr, tmp_out_offset, tmp);
+            // skip last element of each line to avoid write conflict except for last line
+            if((pos.x < (width / element_count)) || ((pos.y == gl_NumWorkGroups.y - 1u) && (pos.z == gl_NumWorkGroups.z - 1u)))
+            {
+                tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+                STORE(dst_ptr, tmp_out_offset, tmp);
+            }
         }
         else
         {
@@ -612,18 +616,18 @@
     {
         tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
         STORE(dst_ptr, tmp_out_offset, tmp);
+    }
 
 #ifdef HAS_BIAS
-        // If it is the last thread in the 3 dimensional workgroup
-        if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
-        {
-            tmp_out_offset += (dst_attrs.stride_x >> dst_shift);
+    // If it is the last thread in the 3 dimensional workgroup
+    if(pos.x == (size.x - 1u) && pos.y == (size.y - 1u) && pos.z == (size.z - 1u))
+    {
+        tmp_out_offset += (dst_attrs.stride_x >> dst_shift);
 
-            mediump vec2 bias_vec = vec2(1.0f, 1.0f);
-            STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
-        }
-#endif // HAS_BIAS
+        mediump vec2 bias_vec = vec2(1.0f, 1.0f);
+        STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
     }
+#endif // HAS_BIAS
 }
 
 #else /* IM2COL_REDUCED_GENERIC */

diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
index 8dc7f0a..f4c8cb9 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -82,7 +82,7 @@
 
 #ifdef DATA_TYPE_FP32
 TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, maskBuffer, float, mask_ptr, mask_shift, 2, );
+TENSOR_DECLARATION(2, maskBuffer, float, mask_ptr, mask_shift, 2, restrict);
 TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
 
 void main(void)
@@ -111,7 +111,7 @@
 
 #elif defined(DATA_TYPE_FP16)
 TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, maskBuffer, uint, mask_ptr, mask_shift, 2, );
+TENSOR_DECLARATION(2, maskBuffer, uint, mask_ptr, mask_shift, 2, restrict);
 TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
 
 void main(void)

diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index 4c8730e..ba50721 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs

@@ -508,7 +508,7 @@
     vec4 acc3 = vec4(0.0f);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(2));
+    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(4));
         TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
     {
         vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -548,7 +548,7 @@
         vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        vec  a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+        vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
@@ -614,7 +614,7 @@
     vec4 acc3 = vec4(0.0f);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
         TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
     {
         vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -728,7 +728,7 @@
     acc[0] = vec4(0.0f);
     acc[1] = vec4(0.0f);
 
-    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+    for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
         TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
     {
         vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -822,7 +822,7 @@
     VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
 
     vec4 u[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
-    vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, bias_iter);
+    vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
 
     vec4 r[2];
     r[0] = u[0] + v[0];
@@ -881,7 +881,6 @@
     c30[0] = vec4(0.0f);
     c30[1] = vec4(0.0f);
 
-    // FIXME: loop unrolling really needed for GLES?
     for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32))
     {
         /* Load values from matrix A (interleaved) and matrix B (transposed) */

diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
index b2689a2..b72c339 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/scale.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/scale.cs

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,23 +29,23 @@
 // We DO have to use highp for DATA_TYPE_FP16 float here to calculate the coordinates of source tensor. float is highp by default, but we still write it down here to make it more clearly, and mediump is only used for src/dst tensor in shader body.
 precision highp float;
 
-/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel FP16.
+/** Performs an affine transformation on a tensor interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel FP16.
  *
  * @param[in]  src_ptr      Pointer to the source tensor. Supported data types: FP16.
  * @param[in]  src_attrs    The attributes of the source tensor
  * @param[out] dst_ptr      Pointer to the destination tensor. Supported data types: FP16. (Must be the same as the input)
  * @param[in]  dst_attrs    The attributes of the destination tensor
- * @param[in]  input_width  Input image width
- * @param[in]  input_height Input image height
+ * @param[in]  input_width  Input tensor width
+ * @param[in]  input_height Input tensor height
  * @param[in]  scale        The scale factor along x/y dimension
  */
 SHADER_PARAMS_DECLARATION
 {
-    ImageAttributes src_attrs;
-    ImageAttributes dst_attrs;
-    float           input_width;
-    float           input_height;
-    vec2            scale;
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    float              input_width;
+    float              input_height;
+    vec2               scale;
 };
 
 #if defined(DATA_TYPE_FP16)
@@ -75,8 +75,8 @@
 
 void main()
 {
-    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
-    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     vec4[2] tc = clamp_to_border_with_size(transform_nearest(vec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y), scale), input_width, input_height, float(BORDER_SIZE));
 
@@ -85,7 +85,7 @@
 
     for(int i = 0; i < 4; i++)
     {
-        uint offset_in_bytes = image_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]));
+        uint offset_in_bytes = tensor3D_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]), int(gl_GlobalInvocationID.z));
 
         s = LOAD_UNPACK2_HALF(src_ptr, uint(offset_in_bytes >> src_shift));
 
@@ -107,15 +107,15 @@
 
 void main()
 {
-    ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
-    ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+    Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     uvec2 tc = uvec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y >> uint(1));
 
     mediump vec4 s = vec4(0.0f);
     mediump      vec4[2] d;
 
-    s = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, int(tc[0]), int(tc[1])));
+    s = LOAD_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, int(tc[0]), int(tc[1]), int(gl_GlobalInvocationID.z)));
 
     d[0] = vec4(s.x, s.x, s.y, s.y);
     d[1] = vec4(s.z, s.z, s.w, s.w);

diff --git a/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs
new file mode 100644
index 0000000..a0af315
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs

@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a shift to move "pad_x" columns to the right.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note The width must be passed at compile time using "#define WIDTH n" e.g. "#define WIDTH 1"
+ *
+ * @param[in,out] src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]     src_attrs The attributes of the source tensor
+ * @param[in]     pad_x     The padding of the source tensor in x dimension
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    uint               pad_x;
+};
+
+#if defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, restrict);
+
+void main() // FP16 path: the buffer is declared as uint and only accessed through the *_PACK2_HALF / *_UNPACK2_HALF helpers, i.e. as vec2 pairs
+{
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+    int              n        = int(pad_x) % 2; // parity of the shift distance: 1 = odd, 0 = even
+
+    if(n == 1) // odd shift: each stored pair is stitched together from two different loaded pairs
+    {
+        int i = 0; // position of the next store (used as 2 * i bytes); set below, then lowered by 2 per iteration
+        if((WIDTH % 2) == 1)
+        {
+            i = WIDTH + int(pad_x) - 2;
+        }
+        else
+        {
+            vec2 s0_end = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH - 2)))); // offsets are in bytes; the factor 2 is presumably the size of one half - TODO confirm
+            vec2 s_end  = vec2(s0_end.y, 0.f); // tail pair: keep only the second component, the other lane is written as zero
+            STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH + int(pad_x) - 1))), s_end);
+            i = WIDTH + int(pad_x) - 3;
+        }
+        for(; i >= (int(pad_x) + 1); i = i - 2) // walks from the high end downwards, one pair per iteration, reading and writing the same buffer
+        {
+            vec2 s0 = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x) - 1))));
+            vec2 s1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x) + 1))));
+            vec2 s  = vec2(s0.y, s1.x); // second component of the lower pair followed by first component of the upper pair
+            STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * i)), s);
+        }
+        for(int j = 0; j < (int(pad_x) - 1); j = j + 2) // zero whole pairs at the start; runs (pad_x - 1) / 2 times and moves the iterator each time
+        {
+            vec2 s_origin = vec2(0.f);
+            STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s_origin);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4); // 4 bytes = one uint of the declared buffer
+        }
+        vec2 s0_origin = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter); // NOTE(review): for odd pad_x > 1 the iterator was advanced and the earlier pairs zeroed above - verify this pair still provides the intended first source element
+        vec2 s_origin  = vec2(0.f, s0_origin.x); // boundary pair: zero in the first lane, loaded first component moved to the second lane
+        STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s_origin);
+    }
+    else // even shift: loaded pairs are stored back unchanged at the shifted offset
+    {
+        int i = 0; // position of the next store (used as 2 * i bytes)
+        if((WIDTH % 2) == 0)
+        {
+            i = WIDTH + int(pad_x) - 2;
+        }
+        else
+        {
+            vec2 s0_end = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH - 1))));
+            vec2 s_end  = vec2(s0_end.x, 0.f); // tail pair: keep only the first component, the other lane is written as zero
+            STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH + int(pad_x) - 1))), s_end);
+            i = WIDTH + int(pad_x) - 3;
+        }
+        for(; i >= (int(pad_x)); i = i - 2) // walks from the high end downwards; the load offset is always pad_x positions below the store offset
+        {
+            vec2 s = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x)))));
+            STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * i)), s);
+        }
+        for(int j = 0; j < int(pad_x); j = j + 2) // zero pad_x / 2 whole pairs starting at the iterator's current item
+        {
+            vec2 s = vec2(0.f);
+            STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4); // 4 bytes = one uint of the declared buffer
+        }
+    }
+}
+#elif defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, restrict);
+
+void main() // FP32 path: moves WIDTH floats pad_x positions towards higher offsets inside the same buffer, then zero-fills the vacated start
+{
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+
+    for(int i = (WIDTH + int(pad_x) - 1); i >= int(pad_x); i--) // highest index first: the value at i - pad_x is read before any store can reach that slot
+    {
+        float sorigin = LOAD(src_ptr, TENSOR_OFFSET_ADVANCE(src_iter, (i - int(pad_x)))); // value currently pad_x positions below the destination
+        STORE(src_ptr, TENSOR_OFFSET_ADVANCE(src_iter, i), sorigin);
+    }
+    for(int j = 0; j < int(pad_x); j++) // write 0.f to pad_x consecutive items starting at the iterator's current item
+    {
+        STORE_CURRENT_ITEM(src_ptr, src_iter, 0.f);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4); // 4 bytes = one float of the declared buffer
+    }
+}
+#else /* DATA_TYPE_FP16 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP16 */

diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index b8672c6..d7c645d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,16 +109,26 @@
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
+    if(_input == _output)
+    {
+        slice_in.shift(Window::DimX, -(_input->info()->padding()).left);
+    }
 
     do
     {
         unsigned int idx     = 0;
         unsigned int binding = 1;
-        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _input, binding++, slice_in);
         add_3D_tensor_argument(idx, _output, binding++, slice);
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }

diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
index caec324..06cf409 100644
--- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -135,18 +135,24 @@
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_2D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx     = 0;
         unsigned int binding = 1; // SSBO binding starts from 1.
-        add_2D_tensor_argument(idx, _input1, binding++, slice);
-        add_2D_tensor_argument(idx, _input2, binding++, slice);
-        add_2D_tensor_argument(idx, _output, binding++, slice);
+        add_3D_tensor_argument(idx, _input1, binding++, slice_in);
+        add_3D_tensor_argument(idx, _input2, binding++, slice_in);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }

diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index dee2a55..cd93f69 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
 }
 
 void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma,
-                                                float epsilon)
+                                                float epsilon, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
@@ -54,7 +54,14 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->data_type() != DataType::F32 && input->info()->data_type() != DataType::F16);
+        ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
+                             && act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
+                             && act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+    }
 
     _input   = input;
     _output  = output;
@@ -79,6 +86,13 @@
     build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
     build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
 
+    if(act_info.enabled())
+    {
+        build_opts.emplace("#define " + string_from_activation_func(act_info.activation()));
+        build_opts.emplace("#define A_VAL " + float_to_string_with_full_precision(act_info.a()));
+        build_opts.emplace("#define B_VAL " + float_to_string_with_full_precision(act_info.b()));
+    }
+
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
 
@@ -105,7 +119,10 @@
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
 
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -116,14 +133,16 @@
     add_1D_tensor_argument(idx, _beta, 5, vector_slice);
     add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
 
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
         add_3D_tensor_argument(idx, _output, 2, slice);
 
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }

diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index 7b1848c..36d1b29 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp

@@ -38,7 +38,7 @@
 using namespace arm_compute;
 
 GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
 {
 }
 
@@ -61,8 +61,9 @@
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
     ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
 
-    _input  = input;
-    _output = output;
+    _input        = input;
+    _output       = output;
+    _depth_offset = depth_offset;
 
     // Add build options
     std::set<std::string> build_opts;
@@ -76,11 +77,8 @@
     _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
     _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
 
-    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
-
-    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
-    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
-    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
+    build_opts.emplace("#define OFFSET_X " + support::cpp11::to_string(_left_right));
+    build_opts.emplace("#define OFFSET_Y " + support::cpp11::to_string(_top_bottom));
 
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
@@ -118,17 +116,24 @@
 
     _kernel.use();
 
-    Window slice = window.first_slice_window_3D();
+    _output->set_needs_shifting(true);
+
+    Window slice     = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+    slice_out.set(Window::DimZ, Window::Dimension(_depth_offset));
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
-        add_3D_tensor_argument(idx, _output, 2, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_3D_tensor_argument(idx, _output, 2, slice_out);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }

diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index 28b5bd2..9343268 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,16 +173,20 @@
     const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
 
     // Calculate input right and bottom border
-    const int input_width    = input->info()->dimension(0);
-    const int input_height   = input->info()->dimension(1);
-    const int padding_right  = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + 2), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_left - input_width;
-    const int padding_bottom = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + 2), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_top - input_height;
+    const int input_width  = input->info()->dimension(0);
+    const int input_height = input->info()->dimension(1);
+
+    const int input_total_width  = std::max(int(input->info()->padding().left), int(_conv_pad_left)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_left));
+    const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_top)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_top));
+
+    const int input_padding_right  = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_left;
+    const int input_padding_bottom = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_top;
 
     BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
 
     Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
 
-    AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + padding_right, input_height + padding_bottom);
+    AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + input_padding_right, input_height + input_padding_bottom);
     AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
     AccessWindowStatic bias_access    = AccessWindowStatic(nullptr, 0, 0, 0, 1);
 
@@ -224,6 +228,8 @@
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     // Create input window and adjust
     Window win_in = window;
     win_in.adjust(Window::DimX, -_conv_pad_left, true);
@@ -246,6 +252,8 @@
         add_1D_tensor_argument(idx, _biases, 4, slice_biases);
     }
 
+    slice_out.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx = 0;

diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index abfe5cc..99b5e7d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp

@@ -62,8 +62,6 @@
     if(bias != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
-        // FIXME: Bug in framework, workaround it in tests currently.
-        //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
     }
 
@@ -84,6 +82,7 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
 
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
@@ -314,12 +313,20 @@
     const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
 
     // Calculate input right and bottom border
-    const int input_width    = input->info()->dimension(0);
-    const int input_height   = input->info()->dimension(1);
-    const int upper_bound_w  = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width;
-    const int upper_bound_h  = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height;
-    const int padding_right  = std::max(upper_bound_w, _conv_pad_x);
-    const int padding_bottom = std::max(upper_bound_h, _conv_pad_y);
+    const int input_width        = input->info()->dimension(0);
+    const int input_height       = input->info()->dimension(1);
+    const int input_total_width  = std::max(int(input->info()->padding().left), int(_conv_pad_x)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_x));
+    const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_y)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_y));
+    const int padding_right1     = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_x;
+    const int padding_bottom1    = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_y;
+
+    const int upper_bound_w   = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width;
+    const int upper_bound_h   = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height;
+    const int padding_right2  = std::max(upper_bound_w, _conv_pad_x);
+    const int padding_bottom2 = std::max(upper_bound_h, _conv_pad_y);
+
+    const int padding_right  = std::max(padding_right1, padding_right2);
+    const int padding_bottom = std::max(padding_bottom1, padding_bottom2);
 
     BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
 
@@ -382,6 +389,8 @@
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     // Get initial windows
     Window slice  = window.first_slice_window_3D();
     Window win_in = window;
@@ -403,6 +412,8 @@
         add_1D_tensor_argument(idx1, _bias, 4, slice_bias);
     }
 
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         unsigned int idx = 0;

diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index bc9c7eb..fac2902 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,6 +89,8 @@
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     Window slice = window.first_slice_window_3D();
 
     Window slice_in;
@@ -100,15 +102,19 @@
     add_1D_tensor_argument(idx, _mean, 3, slice_in);
     add_1D_tensor_argument(idx, _sd, 4, slice_in);
 
+    slice_in = window.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
+
     do
     {
         idx = 0;
-        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
         add_3D_tensor_argument(idx, _output, 2, slice);
 
         _kernel.update_shader_params();
 
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }

diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index 6451db7..3a0944c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,14 +60,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
                                     "Unsupported combination of parameters!");
+    ARM_COMPUTE_RETURN_ERROR_ON(!pool_info.pad_stride_info().padding_is_symmetric());
 
     const bool         is_global_pooling = pool_info.is_global_pooling();
-    const unsigned int pool_size         = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
+    const unsigned int pool_size         = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
                                     "Global pooling is supported only with rectangular inputs!");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
                                     "Invalid pool size and pool pad combination!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_size().width != pool_info.pool_size().height, "Invalid Pool size, width not equal to height!");
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -97,7 +99,7 @@
     int                 pool_stride_y   = 0;
     unsigned int        pooled_w        = 0;
     unsigned int        pooled_h        = 0;
-    int                 pool_size       = pool_info.pool_size();
+    int                 pool_size       = pool_info.pool_size().width;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
@@ -196,11 +198,14 @@
         const int output_height         = output->dimension(1);
         const int output_padding_right  = ceil_to_multiple(output_width, num_elems_processed_per_iteration) - output_width;
         const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
-        const int input_padding_right   = ceil_to_multiple(input_width + 2 * border_size.right, num_elems_processed_per_iteration) - (input_width + 2 * border_size.right);
-        const int input_padding_bottom  = ceil_to_multiple(input_height + 2 * border_size.bottom, 1) - (input_height + 2 * border_size.bottom);
+
+        const int input_total_width    = std::max(int(input->padding().left), int(pool_pad_x)) + input_width + std::max(int(input->padding().right), int(pool_pad_x));
+        const int input_padding_right  = ceil_to_multiple(input_total_width, num_elems_processed_per_iteration) - input_width - pool_pad_x;
+        const int input_total_height   = std::max(int(input->padding().top), int(pool_pad_y)) + input_height + std::max(int(input->padding().bottom), int(pool_pad_y));
+        const int input_padding_bottom = input_total_height - input_height - pool_pad_y;
 
         // Configure kernel window
-        AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right + input_padding_right, input_height + border_size.bottom + input_padding_bottom);
+        AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + input_padding_right, input_height + input_padding_bottom);
         AccessWindowStatic output_access(output, 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
         bool               window_changed = update_window_and_padding(win, input_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -229,7 +234,7 @@
     unsigned int        pooled_w        = 0;
     unsigned int        pooled_h        = 0;
     const PoolingType   pool_type       = pool_info.pool_type();
-    int                 pool_size       = pool_info.pool_size();
+    int                 pool_size       = pool_info.pool_size().width;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
     const bool          exclude_padding = pool_info.exclude_padding();
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
@@ -338,13 +343,19 @@
 
     _kernel.use();
 
+    _output->set_needs_shifting(true);
+
     Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
+
+    Window slice         = window_collapsed.first_slice_window_3D();
+    Window slice_in_orig = window_collapsed.first_slice_window_3D();
+
+    slice.shift(Window::DimX, -(_output->info()->padding()).left);
 
     do
     {
         // Upsample input by pool size
-        Window in_slice(slice); // NOLINT
+        Window in_slice(slice_in_orig); // NOLINT
         in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
         in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
 
@@ -356,5 +367,5 @@
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_in_orig));
 }

diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index f307cfb..46d7ff9 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -128,9 +128,34 @@
 
     IGCKernel::configure(win);
 
-    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the tensor parameters
     _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(0)));
     _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(1)));
     _kernel.set_argument<float>(idx++, wr);
     _kernel.set_argument<float>(idx++, hr);
 }
+
+void GCScaleKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+    // Flag the output as needing a shift; GCTensorShiftKernel::run() skips tensors whose needs_shifting() is false.
+    _output->set_needs_shifting(true);
+    // Two cursors over the same window: slice_in addresses the input, slice the output and the dispatch.
+    Window slice    = window.first_slice_window_3D();
+    Window slice_in = window.first_slice_window_3D();
+    // Move the output slice left by the output's left padding; the cast keeps the negation signed.
+    slice.shift(Window::DimX, -static_cast<int>(_output->info()->padding().left));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+}

diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
new file mode 100644
index 0000000..21946b7
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp

@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::gles_compute;
+
+GCTensorShiftKernel::GCTensorShiftKernel() // Starts with no tensor bound, a 1x1x1 _lws and a left padding of 0; configure() sets the tensor.
+    : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0)
+{
+}
+
+void GCTensorShiftKernel::configure(IGCTensor *input)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    _input = input;
+
+    // Shader build options: local work-group size, tensor width (dimension 0) and element type.
+    std::set<std::string> options;
+    options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
+    options.emplace("#define WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
+
+    const std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    options.emplace(("#define " + dt_name));
+
+    // The kernel name is fixed, so it is passed directly instead of being assembled in a string stream.
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("tensorshift", options));
+
+    // A single X step spans a whole padded row: dimension 0 plus the left and right padding.
+    const unsigned int num_elems_written_per_iteration_x = input->info()->dimension(0) + input->info()->padding().left + input->info()->padding().right;
+
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_x));
+    win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimY);
+    win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimZ);
+
+    // Cache the left padding as it is now; run() passes it to the shader and skips the dispatch when it is 0.
+    _left_padding = _input->info()->padding().left;
+    IGCKernel::configure(win);
+}
+
+void GCTensorShiftKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Nothing to dispatch when no left padding was cached or the tensor is not flagged for shifting.
+    if(static_cast<int>(_left_padding) == 0 || !_input->needs_shifting())
+    {
+        return;
+    }
+
+    _kernel.use();
+
+    // First 3D slice, moved left by the tensor's current left padding (the cast keeps the negation signed).
+    Window slice = window.first_slice_window_3D();
+    slice.shift(Window::DimX, -static_cast<int>(_input->info()->padding().left));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice);
+        // Presumably idx was advanced past the tensor's arguments; the next one is the cached left padding.
+        _kernel.set_argument(idx++, static_cast<unsigned int>(_left_padding));
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice, _lws);
+    }
+    while(window.slide_window_slice_3D(slice));
+}

diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 693d851..c73f4e7 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -122,9 +122,10 @@
         if(min_y < front_pad_y_available)
         {
             // Not enough padding available, need to shrink the window
-            const int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
+            int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
+            start     = std::min<int>(start / _scale_y, window.y().end());
 
-            window.set(1, Window::Dimension(start / _scale_y, window.y().end(), window.y().step()));
+            window.set(1, Window::Dimension(start, window.y().end(), window.y().step()));
             window_modified = true;
         }
 
@@ -143,8 +144,10 @@
         if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
         {
             // Not enough padding available, need to shrink the window
-            const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
-            window.set(1, Window::Dimension(window.y().start(), end / _scale_y, window.y().step()));
+            int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
+            end     = std::max<int>(window.y().start(), end / _scale_y);
+
+            window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
             window_modified = true;
         }
     }
@@ -164,8 +167,10 @@
         if(min_x < front_pad_x_available)
         {
             // Not enough padding available, need to shrink the window
-            const int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
-            window.set(0, Window::Dimension(start / _scale_x, window.x().end(), window.x().step()));
+            int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
+            start     = std::min<int>(start / _scale_x, window.x().end());
+
+            window.set(0, Window::Dimension(start, window.x().end(), window.x().step()));
             window_modified = true;
         }
 
@@ -181,8 +186,10 @@
         if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
         {
             // Not enough padding available, need to shrink the window
-            const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
-            window.set(0, Window::Dimension(window.x().start(), end / _scale_x, window.x().step()));
+            int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
+            end     = std::max<int>(window.x().start(), end / _scale_x);
+
+            window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
             window_modified = true;
         }
     }
@@ -192,7 +199,7 @@
     return window_modified;
 }
 
-bool AccessWindowRectangle::update_padding_if_needed(const Window &window) const
+bool AccessWindowRectangle::update_padding_if_needed(const Window &window)
 {
     // Only update the padding if the tensor allows it
     if(_info == nullptr || !_info->is_resizable())
@@ -200,8 +207,8 @@
         return false;
     }
 
-    ARM_COMPUTE_ERROR_ON(window.x().step() * _scale_x == 0);
-    ARM_COMPUTE_ERROR_ON(window.y().step() * _scale_y == 0);
+    ARM_COMPUTE_ERROR_ON(_scale_x == 0);
+    ARM_COMPUTE_ERROR_ON(_scale_y == 0);
 
     const int min_x = window.x().start() * _scale_x + _x;
     const int max_x = (window.x().end() - window.x().step()) * _scale_x + _x + _width;

diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 9670b77..a6dbfe6 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,8 +109,9 @@
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
 
-    ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU),
-                             "For QASYMM8 only lower/upper bounded relu is supported");
+    ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                             && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
+                             "For QASYMM8 only relu and lower/upper bounded relu are supported");
 
     // Activation functions : FP32
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -179,6 +180,7 @@
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 =
     {
         { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t> },
+        { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qasymm8_t> },
     };
 
     switch(input->info()->data_type())
@@ -359,8 +361,16 @@
         const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
         const auto output_ptr = reinterpret_cast<float *>(output.ptr());
 
-        const float32x4x4_t in  = vld4q_f32(input_ptr);
-        float32x4x4_t       tmp = { {} };
+        const float32x4x4_t in =
+        {
+            {
+                vld1q_f32(input_ptr),
+                vld1q_f32(input_ptr + 4),
+                vld1q_f32(input_ptr + 8),
+                vld1q_f32(input_ptr + 12)
+            }
+        };
+        float32x4x4_t tmp = { {} };
 
         switch(F)
         {
@@ -489,7 +499,10 @@
                 break;
         }
 
-        vst4q_f32(output_ptr, tmp);
+        vst1q_f32(output_ptr, tmp.val[0]);
+        vst1q_f32(output_ptr + 4, tmp.val[1]);
+        vst1q_f32(output_ptr + 8, tmp.val[2]);
+        vst1q_f32(output_ptr + 12, tmp.val[3]);
     },
     input, output);
 }
@@ -561,12 +574,14 @@
 template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
-    Iterator               input(_input, window);
-    Iterator               output(_output, window);
-    const QuantizationInfo qi_in  = _input->info()->quantization_info();
-    const QuantizationInfo qi_out = _output->info()->quantization_info();
-    const qasymm8x16_t     a      = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
-    const qasymm8x16_t     b      = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
+    Iterator                  input(_input, window);
+    Iterator                  output(_output, window);
+    const QuantizationInfo    qi_in   = _input->info()->quantization_info();
+    const QuantizationInfo    qi_out  = _output->info()->quantization_info();
+    const qasymm8x16_t        a       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
+    const qasymm8x16_t        b       = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
+    // Not static: the quantized zero depends on this input's scale/offset, so it must be rebuilt on every call.
+    const qasymm8x16_t        CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset));
     // Initialise scale/offset for re-quantization
     float       s  = qi_in.scale / qi_out.scale;
     float       o  = -qi_in.offset * s + qi_out.offset;
@@ -589,6 +604,12 @@
                 // Re-quantize to new output space
                 tmp = vmlaq_qasymm8(tmp, vs, vo);
                 break;
+            case ActivationFunction::RELU:
+                // Perform activation
+                tmp = vmaxq_u8(CONST_0, in);
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+                break;
             default:
                 ARM_COMPUTE_ERROR("Function not implemented");
                 break;

diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 8a98cf7..a487090 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,10 +46,12 @@
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
 void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -64,8 +66,8 @@
 
 void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -80,8 +82,8 @@
 
 void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -93,8 +95,8 @@
 
 void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -163,8 +165,8 @@
 void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -186,8 +188,8 @@
 
 void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -202,8 +204,8 @@
 
 void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -218,8 +220,8 @@
 
 void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -234,8 +236,8 @@
 
 void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -257,8 +259,8 @@
 
 void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -292,8 +294,8 @@
 
 void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -325,8 +327,8 @@
 
 void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -356,50 +358,84 @@
     input1, input2, output);
 }
 
-inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
 {
     ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
 
-    if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(input2.data_type()))
     {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2);
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        !(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8 && output->data_type() == DataType::QS8)
-        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
-        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::QS16 && input2->data_type() == DataType::QS16 && output->data_type() == DataType::QS16)
-        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
-        && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
-        "You called addition with the wrong image formats");
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            !(input1.data_type() == DataType::QS8 && input2.data_type() == DataType::QS8 && output.data_type() == DataType::QS8)
+            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::QS16 && input2.data_type() == DataType::QS16 && output.data_type() == DataType::QS16)
+            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
+            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
+            "You called addition with the wrong image formats");
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+
+        if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(output.data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output);
+        }
+    }
 
     return Status{};
 }
 
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
 
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(output, out_shape);
 
-    bool window_changed = update_window_and_padding(win,
-                                                    AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
-                                                    AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
-                                                    output_access);
+        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+        {
+            set_format_if_unknown(output, Format::S16);
+        }
+        else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+        {
+            set_format_if_unknown(output, Format::F16);
+        }
+        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+        {
+            set_format_if_unknown(output, Format::F32);
+        }
+    }
 
-    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
-                                                       input2->valid_region());
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
 
     output_access.set_valid_region(win, valid_region);
 
@@ -416,26 +452,11 @@
 void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
 
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output->info(), Format::S16);
-        }
-        else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
-        {
-            set_format_if_unknown(*output->info(), Format::F16);
-        }
-        else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output->info(), Format::F32);
-        }
-    }
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     static std::map<std::string, AddFunction *> map_function =
     {
@@ -476,16 +497,15 @@
         _func = it->second;
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
 Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -499,3 +519,10 @@
 
     (*_func)(_input1, _input2, _output, window);
 }
+
+BorderSize NEArithmeticAdditionKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize(0, border, 0, 0);
+}

diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index f5144c6..1f730a2 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,19 +26,34 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include <map>
+
 using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
+                   const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_UNUSED(epsilon);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16,
+                                                         DataType::F32);
+
+    if(act_info.enabled())
+    {
+        ActivationLayerInfo::ActivationFunction act = act_info.activation();
+        ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
+                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
+    }
 
     if(nullptr != output)
     {
@@ -67,28 +82,32 @@
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
+} //namespace
 
-void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+template <bool fused_activation>
+void NEBatchNormalizationLayerKernel::batch_normalization_qs8(const Window &window)
 {
-    Iterator input(in, window);
-    Iterator output(out, window);
+    static_assert(!fused_activation, "Activation is not supported for QS8");
+
+    Iterator input(_input, window);
+    Iterator output(_output, window);
 
     // Hold information about the current feature map we are iterating.
     // Only compute denominator and NEON vectors once per feature map.
     int slice = -1;
 
-    const int  fixed_point_position = in->info()->fixed_point_position();
-    const auto input_mean           = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var            = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma          = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta           = reinterpret_cast<const qint8_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+    const int  fixed_point_position = _input->info()->fixed_point_position();
+    const auto input_mean           = reinterpret_cast<const qint8_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var            = reinterpret_cast<const qint8_t *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma          = reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
+    const auto input_beta           = reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
 
     qint8x16_t       mean_vec    = vdupq_n_qs8(0);
     qint8x16_t       var_vec     = vdupq_n_qs8(0);
     qint8x16_t       gamma_vec   = vdupq_n_qs8(0);
     qint8x16_t       beta_vec    = vdupq_n_qs8(0);
     qint8x16_t       denominator = vdupq_n_qs8(0);
-    const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
+    const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position));
     execute_window_loop(window, [&](const Coordinates & id)
     {
         if(slice != id.z())
@@ -112,27 +131,30 @@
     input, output);
 }
 
-void batch_normalization_q16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+template <bool fused_activation>
+void NEBatchNormalizationLayerKernel::batch_normalization_qs16(const Window &window)
 {
-    Iterator input(in, window);
-    Iterator output(out, window);
+    static_assert(!fused_activation, "Activation is not supported for QS16");
+
+    Iterator input(_input, window);
+    Iterator output(_output, window);
 
     // Hold information about the current feature map we are iterating.
     // Only compute denominator and NEON vectors once per feature map.
     int slice = -1;
 
-    const int  fixed_point_position = in->info()->fixed_point_position();
-    const auto input_mean           = reinterpret_cast<const qint16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var            = reinterpret_cast<const qint16_t *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma          = reinterpret_cast<const qint16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta           = reinterpret_cast<const qint16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+    const int  fixed_point_position = _input->info()->fixed_point_position();
+    const auto input_mean           = reinterpret_cast<const qint16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var            = reinterpret_cast<const qint16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma          = reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
+    const auto input_beta           = reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
 
     qint16x8_t       mean_vec    = vdupq_n_qs16(0);
     qint16x8_t       var_vec     = vdupq_n_qs16(0);
     qint16x8_t       gamma_vec   = vdupq_n_qs16(0);
     qint16x8_t       beta_vec    = vdupq_n_qs16(0);
     qint16x8_t       denominator = vdupq_n_qs16(0);
-    const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(epsilon, fixed_point_position));
+    const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position));
     execute_window_loop(window, [&](const Coordinates & id)
     {
         if(slice != id.z())
@@ -156,70 +178,31 @@
     input, output);
 }
 
-void batch_normalization_fp32(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
+template <bool fused_activation>
+void NEBatchNormalizationLayerKernel::batch_normalization_fp16(const Window &window)
 {
-    Iterator input(in, window);
-    Iterator output(out, window);
+    static_assert(!fused_activation, "Activation is not supported for FP16");
 
-    // Hold information about the current feature map we are iterating.
-    // Only compute denominator and NEON vectors once per feature map.
-    int slice = -1;
-
-    const auto input_mean  = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta  = reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0)));
-
-    float32x4_t       mean_vec    = vdupq_n_f32(0.0);
-    float32x4_t       var_vec     = vdupq_n_f32(0.0);
-    float32x4_t       gamma_vec   = vdupq_n_f32(0.0);
-    float32x4_t       beta_vec    = vdupq_n_f32(0.0);
-    float32x4_t       denominator = vdupq_n_f32(0.0);
-    const float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(slice != id.z())
-        {
-            // Conctruct vectors
-            mean_vec  = vdupq_n_f32(*(input_mean + id.z()));
-            var_vec   = vdupq_n_f32(*(input_var + id.z()));
-            gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
-            beta_vec  = vdupq_n_f32(*(input_beta + id.z()));
-
-            // Calculate denominator
-            denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
-            slice       = id.z();
-        }
-
-        // Calculate x bar and store results
-        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
-        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec));
-    },
-    input, output);
-}
-
+    ARM_COMPUTE_UNUSED(window);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void batch_normalization_fp16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
-{
-    Iterator input(in, window);
-    Iterator output(out, window);
+    Iterator input(_input, window);
+    Iterator output(_output, window);
 
     // Hold information about the current feature map we are iterating.
     // Only compute denominator and NEON vectors once per feature map.
     int slice = -1;
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta  = reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
+    const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
+    const auto input_beta  = reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
 
     float16x8_t       mean_vec    = vdupq_n_f16(0.0);
     float16x8_t       var_vec     = vdupq_n_f16(0.0);
     float16x8_t       gamma_vec   = vdupq_n_f16(0.0);
     float16x8_t       beta_vec    = vdupq_n_f16(0.0);
     float16x8_t       denominator = vdupq_n_f16(0.0);
-    const float16x8_t epsilon_vec = vdupq_n_f16(epsilon);
+    const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
     execute_window_loop(window, [&](const Coordinates & id)
     {
         if(slice != id.z())
@@ -241,16 +224,116 @@
         vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
     },
     input, output);
-}
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace
+}
+
+template <bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_fp32(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    F activation_functor(_act_info);
+
+    // Hold information about the current feature map we are iterating.
+    // Only compute denominator and NEON vectors once per feature map.
+    int slice = -1;
+
+    const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0)));
+    const auto input_beta  = reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0)));
+
+    float32x4_t       mean_vec    = vdupq_n_f32(0.0);
+    float32x4_t       var_vec     = vdupq_n_f32(0.0);
+    float32x4_t       gamma_vec   = vdupq_n_f32(0.0);
+    float32x4_t       beta_vec    = vdupq_n_f32(0.0);
+    float32x4_t       denominator = vdupq_n_f32(0.0);
+    const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        if(slice != id.z())
+        {
+            // Construct vectors
+            mean_vec  = vdupq_n_f32(*(input_mean + id.z()));
+            var_vec   = vdupq_n_f32(*(input_var + id.z()));
+            gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+            beta_vec  = vdupq_n_f32(*(input_beta + id.z()));
+
+            // Calculate denominator
+            denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+            slice       = id.z();
+        }
+
+        // Calculate x bar
+        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
+        float32x4_t       res       = vmlaq_f32(beta_vec, x_bar, gamma_vec);
+
+        // Perform fused activation
+        if(fused_activation)
+        {
+            activation_functor(res);
+        }
+
+        // Store results
+        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+    },
+    input, output);
+}
+
+void NEBatchNormalizationLayerKernel::configure_non_fused()
+{
+    switch(_input->info()->data_type())
+    {
+        case DataType::QS8:
+            _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs8<false>;
+            break;
+        case DataType::QS16:
+            _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16<false>;
+            break;
+        case DataType::F16:
+            _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp16<false>;
+            break;
+        case DataType::F32:
+            _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp32<false, ::detail::dummy<float, 4>>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+}
+
+void NEBatchNormalizationLayerKernel::configure_fused()
+{
+    // Fused Batch Normalization with activation functions: FP32
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32 =
+    {
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::lubrelu<float, 4>> }
+    };
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = bn_fused_map_f32[_act_info.activation()];
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+}
 
 NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
 {
 }
 
-void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
+                                                const ITensor *mean, const ITensor *var,
+                                                const ITensor *beta, const ITensor *gamma,
+                                                float epsilon, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
 
@@ -264,40 +347,33 @@
         output_info = output->info();
     }
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info, mean->info(), var->info(), beta->info(), gamma->info(), epsilon));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info,
+                                                  mean->info(), var->info(),
+                                                  beta->info(), gamma->info(),
+                                                  epsilon, act_info));
 
-    _input   = input;
-    _output  = input;
-    _mean    = mean;
-    _var     = var;
-    _gamma   = gamma;
-    _beta    = beta;
-    _epsilon = epsilon;
+    _input    = input;
+    _output   = input;
+    _mean     = mean;
+    _var      = var;
+    _gamma    = gamma;
+    _beta     = beta;
+    _epsilon  = epsilon;
+    _act_info = act_info;
 
     if(output != nullptr)
     {
         _output = output;
     }
 
-    switch(input->info()->data_type())
+    // Configure activation function to run
+    if(_act_info.enabled())
     {
-        case DataType::QS8:
-            _func = &batch_normalization_q8;
-            break;
-        case DataType::QS16:
-            _func = &batch_normalization_q16;
-            break;
-        case DataType::F32:
-            _func = &batch_normalization_fp32;
-            break;
-        case DataType::F16:
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            _func = &batch_normalization_fp16;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
+        configure_fused();
+    }
+    else
+    {
+        configure_non_fused();
     }
 
     // Configure kernel window
@@ -306,11 +382,12 @@
     INEKernel::configure(win_config.second);
 }
 
-Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta,
-                                                 const ITensorInfo *gamma,
-                                                 float              epsilon)
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                 const ITensorInfo *mean, const ITensorInfo *var,
+                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
+                                                 float epsilon, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output ? output->clone().get() : nullptr).first);
 
     return Status{};
@@ -323,5 +400,5 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window);
+    (this->*_func)(window);
 }

diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 3888300..c1e3e1f 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
@@ -32,6 +33,7 @@
 #include <cstdint>
 
 using namespace arm_compute;
+using namespace arm_compute::wrapper;
 
 namespace arm_compute
 {
@@ -40,12 +42,14 @@
 
 namespace
 {
-inline void bitwise_and_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+template <typename T, int S>
+inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, T *__restrict output)
 {
-    const uint8x16_t val1 = vld1q_u8(input1);
-    const uint8x16_t val2 = vld1q_u8(input2);
+    using type      = typename wrapper::traits::neon_vector<T, S>::type;
+    const type val1 = vloadq(static_cast<const T *>(input1));
+    const type val2 = vloadq(static_cast<const T *>(input2));
 
-    vst1q_u8(output, vandq_u8(val1, val2));
+    vstore(static_cast<T *>(output), vand(val1, val2));
 }
 } // namespace
 
@@ -104,7 +108,7 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        bitwise_and_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
+        bitwise_and<uint8_t, 16>(input1.ptr(), input2.ptr(), output.ptr());
     },
     input1, input2, output);
 }

diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index bac2471..98b2f28 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,106 +56,68 @@
 
     set_format_if_unknown(*output->info(), Format::U8);
 
+    // Check if input tensor has a valid format
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
 
-    unsigned int num_elems_processed_per_iteration = 8;
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
 
-    // Check format and channel
-    const Format       format      = input->info()->format();
-    const unsigned int subsampling = (format == Format::YUYV422 || format == Format::UYVY422) && channel != Channel::Y ? 2 : 1;
-    TensorShape        output_shape;
+    // Check if channel is valid for given format
+    const Format format = input->info()->format();
+    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
 
-    switch(format)
+    unsigned int subsampling = 1;
+
+    if(format == Format::YUYV422 || format == Format::UYVY422)
     {
-        case Format::RGB888:
-        case Format::RGBA8888:
-            num_elems_processed_per_iteration = 16;
-            output_shape                      = input->info()->tensor_shape();
+        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input);
 
-            if(format == Format::RGB888)
-            {
-                _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
-            }
-            else if(format == Format::RGBA8888)
-            {
-                _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
-            }
-
-            switch(channel)
-            {
-                case Channel::R:
-                    _lut_index = 0;
-                    break;
-                case Channel::G:
-                    _lut_index = 1;
-                    break;
-                case Channel::B:
-                    _lut_index = 2;
-                    break;
-                case Channel::A:
-                    if(format == Format::RGBA8888)
-                    {
-                        _lut_index = 3;
-                        _func      = &NEChannelExtractKernel::extract_1C_from_4C_img;
-                        break;
-                    }
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel for this format.");
-                    break;
-            }
-            break;
-        case Format::YUYV422:
-        case Format::UYVY422:
-            output_shape = input->info()->tensor_shape();
-
-            if(channel != Channel::Y)
-            {
-                output_shape.set(0, output_shape[0] / 2);
-            }
-
-            switch(channel)
-            {
-                case Channel::Y:
-                    num_elems_processed_per_iteration = 16;
-                    _func                             = &NEChannelExtractKernel::extract_1C_from_2C_img;
-                    _lut_index                        = (Format::YUYV422 == format) ? 0 : 1;
-                    break;
-                case Channel::U:
-                    num_elems_processed_per_iteration = 32;
-                    _func                             = &NEChannelExtractKernel::extract_YUYV_uv;
-                    _lut_index                        = (Format::YUYV422 == format) ? 1 : 0;
-                    break;
-                case Channel::V:
-                    num_elems_processed_per_iteration = 32;
-                    _func                             = &NEChannelExtractKernel::extract_YUYV_uv;
-                    _lut_index                        = (Format::YUYV422 == format) ? 3 : 2;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel for this format.");
-                    break;
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported format.");
-            break;
+        if(channel != Channel::Y)
+        {
+            subsampling = 2;
+        }
     }
 
+    TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel);
     set_shape_if_empty(*output->info(), output_shape);
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
 
-    _input  = input;
-    _output = output;
+    _input     = input;
+    _output    = output;
+    _lut_index = channel_idx_from_format(format, channel);
 
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    unsigned int num_elems_processed_per_iteration = 16;
+
+    if(format == Format::YUYV422 || format == Format::UYVY422)
+    {
+        _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
+
+        if(channel != Channel::Y) // Channel::U or Channel::V
+        {
+            num_elems_processed_per_iteration = 32;
+            _func                             = &NEChannelExtractKernel::extract_YUYV_uv;
+        }
+    }
+    else // Format::RGB888 or Format::RGBA8888
+    {
+        _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
+
+        if(format == Format::RGBA8888)
+        {
+            _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
+        }
+    }
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
     AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
     AccessWindowRectangle  output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
-
     update_window_and_padding(win, input_access, output_access);
 
     ValidRegion input_valid_region = input->info()->valid_region();
-
     output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
 
     INEKernel::configure(win);
@@ -168,94 +130,45 @@
 
     set_format_if_unknown(*output->info(), Format::U8);
 
-    switch(input->info()->format())
+    const Format format = input->info()->format();
+    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+    // Get input plane
+    const IImage *input_plane = input->plane(plane_idx_from_channel(format, channel));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane);
+
+    if(Channel::Y == channel && format != Format::YUV444)
     {
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-            switch(channel)
-            {
-                case Channel::Y:
-                    set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
-                    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
-                    break;
-                case Channel::U:
-                case Channel::V:
-                    set_shape_if_empty(*output->info(), input->plane(1)->info()->tensor_shape());
-                    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(1), output);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported channel for selected format");
-            }
-            break;
-        case Format::YUV444:
-            set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported format");
+        // Check if the shape of the Y plane is even for formats with subsampled channels (NV12, NV21 and IYUV)
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane);
     }
 
+    // Calculate 2x2 subsampled tensor shape
+    TensorShape output_shape = calculate_subsampled_shape(input->plane(0)->info()->tensor_shape(), format, channel);
+    set_shape_if_empty(*output->info(), output_shape);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
+
+    // Check if input tensor has a valid format
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
 
+    _input     = input_plane;
+    _output    = output;
+    _lut_index = channel_idx_from_format(format, channel);
+
     unsigned int num_elems_processed_per_iteration = 32;
 
-    const Format &format = input->info()->format();
+    _func = &NEChannelExtractKernel::copy_plane;
 
-    switch(format)
+    if((format == Format::NV12 || format == Format::NV21) && channel != Channel::Y)
     {
-        case Format::NV12:
-        case Format::NV21:
-            switch(channel)
-            {
-                case Channel::Y:
-                    _input = input->plane(0);
-                    _func  = &NEChannelExtractKernel::copy_plane;
-                    break;
-                case Channel::U:
-                    _input                            = input->plane(1);
-                    num_elems_processed_per_iteration = 16;
-                    _func                             = &NEChannelExtractKernel::extract_1C_from_2C_img;
-                    _lut_index                        = (Format::NV12 == format) ? 0 : 1;
-                    break;
-                case Channel::V:
-                    _input                            = input->plane(1);
-                    num_elems_processed_per_iteration = 16;
-                    _func                             = &NEChannelExtractKernel::extract_1C_from_2C_img;
-                    _lut_index                        = (Format::NV12 == format) ? 1 : 0;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel for this format.");
-                    break;
-            }
-            break;
-        case Format::IYUV:
-        case Format::YUV444:
-            _func = &NEChannelExtractKernel::copy_plane;
-            switch(channel)
-            {
-                case Channel::Y:
-                    _input = input->plane(0);
-                    break;
-                case Channel::U:
-                    _input = input->plane(1);
-                    break;
-                case Channel::V:
-                    _input = input->plane(2);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported channel for this format.");
-                    break;
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported format.");
-            break;
+        num_elems_processed_per_iteration = 16;
+        _func                             = &NEChannelExtractKernel::extract_1C_from_2C_img;
     }
 
-    _output                    = output;
-    Window                 win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
     AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
     update_window_and_padding(win, input_access, output_access);

diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 7468f58..0a10546 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1456,8 +1456,8 @@
     constexpr unsigned int num_elems_read_per_iteration      = 16;
     constexpr unsigned int num_elems_written_per_iteration   = 8;
 
-    Window                 win           = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
-    AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, num_elems_written_per_iteration);
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
 
     update_window_and_padding(win,
                               AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),

diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index ca22af0..d2eac2c 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -102,7 +102,7 @@
     }
     else
     {
-        const float diff = image_size - cd_min;
+        const float diff = image_size - 1;
 
         for(unsigned int x = 0; x < _histogram_size; ++x)
         {

diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index bc2f1ed..f5ee608 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp

@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"
+#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/AccessWindowTranspose.h"
@@ -34,13 +34,16 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 using namespace arm_compute::detail;
 using namespace arm_compute::misc::shape_calculator;
+using namespace depthwise;
 
 namespace
 {
@@ -143,7 +146,7 @@
 } // namespace
 
 NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0)
+    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false)
 {
 }
 
@@ -152,35 +155,99 @@
     return _border_size;
 }
 
-void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, DataLayout data_layout)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+    _input     = input;
+    _output    = output;
+    _weights   = weights;
+    _conv_info = conv_info;
+    _convolver = nullptr;
+
+    _run_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
+                                                                                           conv_info,
+                                                                                           input->info()->data_type(),
+                                                                                           data_layout);
+
+    (_run_optimized) ? configure_optimized() : configure_generic();
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_UNUSED(info);
+
+    (_run_optimized) ? run_optimized(window, info) : run_generic(window, info);
+}
+
+bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, DataLayout data_layout)
+{
+    // Reshape input shape if in NHWC format
+    TensorShape in_shape{ input_shape };
+    if(data_layout == DataLayout::NHWC)
+    {
+        in_shape.set(Window::DimX, input_shape.y());
+        in_shape.set(Window::DimY, input_shape.z());
+        in_shape.set(Window::DimZ, input_shape.x());
+    }
+
+    // Check supported data type
+    bool supported_datatype = (dt == DataType::F32);
+
+    // Check for supported strides
+    const auto &strides           = conv_info.stride();
+    bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));
+
+    // Check for supported padding
+    const auto    pad_top           = conv_info.pad_top();
+    const auto    pad_right         = conv_info.pad_right();
+    const auto    pad_bottom        = conv_info.pad_bottom();
+    const auto    pad_left          = conv_info.pad_left();
+    PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(3U, 3U), conv_info);
+    bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
+    bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
+    bool          supported_padding = is_same_padding || is_valid_padding;
+
+    return supported_datatype && supported_strides && supported_padding;
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
+    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
+
+    _convolver = create_convolver_object(_input->info()->tensor_shape(), _conv_info,
+                                         _weights->buffer(), _input->buffer(), _output->buffer());
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
+{
+    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(0) != 3 || _weights->info()->dimension(1) != 3);
 
     // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
-    const DataType    output_dt    = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info);
+    const DataType    output_dt    = (_input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : _input->info()->data_type();
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(),
-                       input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
+    auto_init_if_empty(*_output->info(),
+                       _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_output->info()->tensor_shape(), output_shape);
 
-    _input                           = input;
-    _output                          = output;
-    _weights                         = weights;
-    _conv_info                       = conv_info;
-    const unsigned int conv_stride_x = conv_info.stride().first;
-    const unsigned int conv_stride_y = conv_info.stride().second;
-    const unsigned int conv_pad_left = conv_info.pad_left();
-    const unsigned int conv_pad_top  = conv_info.pad_top();
+    const unsigned int conv_stride_x   = _conv_info.stride().first;
+    const unsigned int conv_stride_y   = _conv_info.stride().second;
+    const unsigned int conv_pad_top    = _conv_info.pad_top();
+    const unsigned int conv_pad_right  = _conv_info.pad_right();
+    const unsigned int conv_pad_bottom = _conv_info.pad_bottom();
+    const unsigned int conv_pad_left   = _conv_info.pad_left();
 
     ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
 
     unsigned int num_elems_read_per_iteration = 0;
-    switch(input->info()->data_type())
+    switch(_input->info()->data_type())
     {
         case DataType::QASYMM8:
             num_elems_read_per_iteration     = 16;
@@ -193,31 +260,51 @@
         default:
             ARM_COMPUTE_ERROR("Data type not supported.");
     }
-    _border_size = BorderSize(conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), conv_pad_left);
+    _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
 
     // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+    Window win = calculate_max_window(*_output->info(), Steps(_num_elems_written_per_iteration));
 
-    const unsigned int num_x_steps               = (output_shape.x() + _num_elems_written_per_iteration - 1) / _num_elems_written_per_iteration;
-    const int          input_num_elems_processed = get_input_num_elems_processed(_num_elems_written_per_iteration, conv_stride_x);
-
-    AccessWindowStatic input_access(input->info(),
-                                    -conv_pad_left,
-                                    -conv_pad_top,
-                                    (num_x_steps - 1) * input_num_elems_processed + num_elems_read_per_iteration,
-                                    conv_stride_y * (output_shape.y() - 1) + 2);
-    AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
-    AccessWindowStatic output_access(output->info(), 0, 0, num_x_steps * _num_elems_written_per_iteration, output_shape.y());
+    AccessWindowRectangle input_access(_input->info(), -conv_pad_left, -conv_pad_top,
+                                       num_elems_read_per_iteration, 3,
+                                       conv_stride_x, conv_stride_y);
+    AccessWindowStatic     weights_access(_weights->info(), 0, 0, 3, 3);
+    AccessWindowHorizontal output_access(_output->info(), 0, _num_elems_written_per_iteration);
 
     update_window_and_padding(win, input_access, weights_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), _output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
 
-void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
+void NEDepthwiseConvolutionLayer3x3Kernel::configure_optimized()
 {
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
+
+    _border_size = BorderSize(0, 0);
+    _convolver   = create_convolver_object(_input->info()->tensor_shape(), _conv_info,
+                                           _weights->buffer(), _input->buffer(), _output->buffer());
+
+    // Auto-configure output
+    bool        same_padding = _conv_info.has_padding();
+    TensorShape output_shape{ _input->info()->tensor_shape() };
+
+    output_shape.set(1, _convolver->output_size(output_shape.y(), same_padding)); // Set width
+    output_shape.set(2, _convolver->output_size(output_shape.z(), same_padding)); // Set height
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*_output->info(),
+                       _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+    // Configure window
+    Window win;
+    auto   win_last = _convolver->get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, const ThreadInfo &info)
+{
     ARM_COMPUTE_UNUSED(info);
 
     switch(_input->info()->data_type())
@@ -232,3 +319,53 @@
             ARM_COMPUTE_ERROR("Not implemented");
     }
 }
+
+void NEDepthwiseConvolutionLayer3x3Kernel::run_optimized(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON(!_convolver);
+
+    const size_t start = window.x().start();
+    const size_t end   = window.x().end();
+    _convolver->run(start, end);
+}
+
+std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(TensorShape    shape,
+                                                                                                                PadStrideInfo  conv_info,
+                                                                                                                const uint8_t *w_ptr,
+                                                                                                                uint8_t       *in_ptr,
+                                                                                                                uint8_t       *out_ptr)
+{
+    const int  in_rows      = shape.z();
+    const int  in_cols      = shape.y();
+    const int  n_batches    = shape[3];
+    const int  n_channels   = shape.x();
+    const bool padding_same = conv_info.has_padding();
+
+    const auto stride_x = conv_info.stride().first;
+    switch(stride_x)
+    {
+        case 1:
+            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>>(
+                       n_batches,
+                       in_rows,
+                       in_cols,
+                       n_channels,
+                       padding_same,
+                       reinterpret_cast<const float *>(w_ptr),
+                       reinterpret_cast<float *>(in_ptr),
+                       reinterpret_cast<float *>(out_ptr));
+        case 2:
+            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>>(
+                       n_batches,
+                       in_rows,
+                       in_cols,
+                       n_channels,
+                       padding_same,
+                       reinterpret_cast<const float *>(w_ptr),
+                       reinterpret_cast<float *>(in_ptr),
+                       reinterpret_cast<float *>(out_ptr));
+        default:
+            return nullptr;
+    }
+}
\ No newline at end of file

diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 2ceb39d..b924d9f 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,40 +37,9 @@
 
 using namespace arm_compute;
 
-NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
-    : _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
+template <typename T>
+void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
 {
-}
-
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
-
-    _input       = input;
-    _output      = output;
-    _kernel_dims = kernel_dims;
-    _conv_info   = conv_info;
-    _has_bias    = has_bias;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
-    //const int kernel_depth   = _input->info()->dimension(2);
     const int input_w        = _input->info()->dimension(0);
     const int input_h        = _input->info()->dimension(1);
     const int input_stride_x = _input->info()->strides_in_bytes().x();
@@ -101,6 +70,13 @@
     const int full_length   = input_w + pad_left + pad_right;
     const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1);
 
+    // Define pad value
+    auto zero = static_cast<T>(0);
+    if(std::is_same<T, uint8_t>::value)
+    {
+        zero = _input->info()->quantization_info().offset;
+    }
+
     execute_window_loop(window_out, [&](const Coordinates & id)
     {
         const int src_pixel_linear = id.y() * stride_x;
@@ -110,7 +86,7 @@
 
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr() + id.z() * input_stride_z;
-        auto                 output_ptr = reinterpret_cast<float *>(out.ptr());
+        auto                 output_ptr = reinterpret_cast<T *>(out.ptr());
         const int            height     = src_y + _kernel_dims.height;
         const int            width      = src_x + _kernel_dims.width;
 
@@ -120,19 +96,76 @@
             {
                 if(x < 0 || x >= input_w || y < 0 || y >= input_h)
                 {
-                    *output_ptr = 0;
+                    *output_ptr = zero;
                 }
                 else
                 {
-                    *output_ptr = *(reinterpret_cast<const float *>(input_ptr + x * input_stride_x + y * input_stride_y));
+                    *output_ptr = *(reinterpret_cast<const T *>(input_ptr + x * input_stride_x + y * input_stride_y));
                 }
             }
         }
 
         if(_has_bias)
         {
-            *output_ptr = static_cast<float>(1);
+            *output_ptr = static_cast<T>(1);
         }
     },
     in, out);
 }
+
+NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
+{
+}
+
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+    _input       = input;
+    _output      = output;
+    _kernel_dims = kernel_dims;
+    _conv_info   = conv_info;
+    _has_bias    = has_bias;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // Set appropriate function to run
+    switch(input->info()->data_type())
+    {
+        case DataType::QASYMM8:
+            _func = &NEDepthwiseIm2ColKernel::run_generic<uint8_t>;
+            break;
+        case DataType::F16:
+            _func = &NEDepthwiseIm2ColKernel::run_generic<half>;
+            break;
+        case DataType::F32:
+            _func = &NEDepthwiseIm2ColKernel::run_generic<float>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+
+    // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    if(_func != nullptr)
+    {
+        (this->*_func)(window);
+    }
+}

diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 9b36df3..8960d8a 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,45 +37,9 @@
 
 using namespace arm_compute;
 
-NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel()
-    : _input(nullptr), _output(nullptr), _conv_dims()
+template <typename T>
+void NEDepthwiseVectorToTensorKernel::vector_to_tensor(const Window &window)
 {
-}
-
-void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, conv_w);
-    output_shape.set(1, conv_h);
-    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
-    _input     = input;
-    _output    = output;
-    _conv_dims = std::pair<size_t, size_t>(conv_w, conv_h);
-
-    // Configure  kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-    // The NEDepthwisevectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
     // const int input_w         = _input->info()->dimension(0);
     const int output_stride_x = _output->info()->strides_in_bytes().x();
     const int output_stride_y = _output->info()->strides_in_bytes().y();
@@ -97,10 +61,75 @@
         const int z       = id.x() / patch_size;
         const int index2D = id.x() - z * patch_size;
 
-        auto input_ptr  = reinterpret_cast<float *>(in.ptr());
-        auto output_ptr = reinterpret_cast<float *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z);
+        auto input_ptr  = reinterpret_cast<T *>(in.ptr());
+        auto output_ptr = reinterpret_cast<T *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z);
 
         *output_ptr = *input_ptr;
     },
     in, out);
 }
+
+NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _conv_dims()
+{
+}
+
+void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, conv_w);
+    output_shape.set(1, conv_h);
+    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    _input     = input;
+    _output    = output;
+    _conv_dims = std::pair<size_t, size_t>(conv_w, conv_h);
+
+    // Set appropriate function to run
+    switch(input->info()->data_type())
+    {
+        case DataType::QASYMM8:
+            _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<uint8_t>;
+            break;
+        case DataType::S32:
+            _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<int32_t>;
+            break;
+        case DataType::F16:
+            _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<half>;
+            break;
+        case DataType::F32:
+            _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<float>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The NEDepthwiseVectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    if(_func != nullptr)
+    {
+        (this->*_func)(window);
+    }
+}

diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 6585fdb..36b17bf 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,16 +37,59 @@
 
 using namespace arm_compute;
 
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+    const int input_w         = input->info()->dimension(0);
+    const int output_stride_x = output->info()->strides_in_bytes().x();
+    const int output_stride_y = output->info()->strides_in_bytes().y();
+
+    Window window_in(window);
+    // The first three dimensions of the input are increased by the inner loops
+    window_in.set(Window::DimX, Window::Dimension(0, input->info()->dimension(0), input->info()->dimension(0)));
+    window_in.set(Window::DimY, Window::Dimension(0, input->info()->dimension(1), 1));
+    window_in.set(Window::DimZ, Window::Dimension(0, input->info()->dimension(2), 1));
+
+    // Setup output window
+    Window window_out;
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator in(input, window_in);
+    Iterator out(output, window_out);
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        auto input_ptr  = reinterpret_cast<T *>(in.ptr());
+        auto output_ptr = reinterpret_cast<T *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y);
+
+        for(int i = 0; i < input_w; ++i, ++input_ptr)
+        {
+            *(output_ptr + i) = *input_ptr;
+        }
+
+        if(bias != nullptr)
+        {
+            *(output_ptr + input_w) = *(reinterpret_cast<T *>(bias->ptr_to_element(Coordinates(id.z()))));
+        }
+    },
+    in, out);
+}
+} // namespace
+
 NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
-    : _input(nullptr), _output(nullptr), _biases(nullptr)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _biases(nullptr)
 {
 }
 
 void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *output, const ITensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
 
@@ -62,6 +105,30 @@
     _output = output;
     _biases = biases;
 
+    switch(_input->info()->element_size()) // the reshape only copies elements, so dispatch on their byte width
+    {
+        case 4:
+        {
+            _func = &weights_reshape<uint32_t>;
+            break;
+        }
+        case 2:
+        {
+            _func = &weights_reshape<uint16_t>;
+            break;
+        }
+        case 1:
+        {
+            _func = &weights_reshape<uint8_t>;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Element size not supported"); // always an error here: there is no condition to test
+            break;
+        }
+    }
+
     // Configure  kernel window
     Window win = calculate_max_window(*input->info(), Steps());
     // The NEDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -74,39 +141,10 @@
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const int input_w         = _input->info()->dimension(0);
-    const int output_stride_x = _output->info()->strides_in_bytes().x();
-    const int output_stride_y = _output->info()->strides_in_bytes().y();
-
-    Window window_in(window);
-    // The first three dimensions of the input are increased by the inner loops
-    window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
-    window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
-    window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
-
-    // Setup output window
-    Window window_out;
-    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, window_in);
-    Iterator out(_output, window_out);
-
-    execute_window_loop(window_in, [&](const Coordinates & id)
+    if(_func != nullptr)
     {
-        auto input_ptr  = reinterpret_cast<float *>(in.ptr());
-        auto output_ptr = reinterpret_cast<float *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y);
-
-        for(int i = 0; i < input_w; ++i, ++input_ptr)
-        {
-            *(output_ptr + i) = *input_ptr;
-        }
-
-        if(_biases != nullptr)
-        {
-            *(output_ptr + input_w) = *(reinterpret_cast<float *>(_biases->ptr_to_element(Coordinates(id.z()))));
-        }
-    },
-    in, out);
+        (*_func)(_input, _biases, _output, window);
+    }
 }

diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 70984f0..be211b2 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,13 +78,11 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
     Window window_input_output(window);
-    window_input_output.collapse_if_possible(INEKernel::window(), 3);
     window_input_output.set(3, Window::Dimension(0, 1, 1));
 
     Window window_min_max;
     window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
     window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_min_max.collapse_if_possible(INEKernel::window(), 1);
 
     Iterator input(_input, window_input_output);
     Iterator output(_output, window_input_output);

diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index cb8246d..285ec2d 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp

@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"
+#include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
@@ -274,6 +274,7 @@
         ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim);
         ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim);
 
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
         const int          input_stride_y  = input->info()->strides_in_bytes().y();
         const int          input_stride_z  = input->info()->strides_in_bytes().z();
         const int          output_stride_y = output->info()->strides_in_bytes().y();
@@ -284,6 +285,8 @@
         const int          range_z         = window.z().end() - window.z().start();
         const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
         const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_left   = conv_info.pad_left();
+        const unsigned int conv_pad_top    = conv_info.pad_top();
 
         // setup output window for the iterator
         Window window_out = window;
@@ -307,7 +310,7 @@
 
         execute_window_loop(window_out, [&](const Coordinates & id)
         {
-            const uint8_t *input_ptr                       = in.ptr();
+            const uint8_t *input_ptr                       = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
             uint8_t       *out_ptr                         = out.ptr();
             int            ih                              = 0;
             int            oh                              = 0;
@@ -351,6 +354,7 @@
     static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                          const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
     {
+        const int          input_stride_x       = input->info()->strides_in_bytes().x();
         const int          input_stride_y       = input->info()->strides_in_bytes().y();
         const int          input_stride_z       = input->info()->strides_in_bytes().z();
         const int          output_stride_y      = output->info()->strides_in_bytes().y();
@@ -362,6 +366,8 @@
         const int          range_z              = window.z().end() - window.z().start();
         const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
         const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_left        = conv_info.pad_left();
+        const unsigned int conv_pad_top         = conv_info.pad_top();
         const int          fixed_point_position = input->info()->fixed_point_position();
 
         // setup output window for the iterator
@@ -389,7 +395,7 @@
             /*
                 For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1>
             */
-            const uint8_t *input_ptr = in.ptr();
+            const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
             uint8_t       *out_ptr   = out.ptr();
             int            ih        = 0;
             int            oh        = 0;
@@ -680,8 +686,8 @@
         const int          delta_input          = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
         const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
         const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
-        const unsigned int conv_pad_x           = std::get<0>(conv_info.pad());
-        const unsigned int conv_pad_y           = std::get<1>(conv_info.pad());
+        const unsigned int conv_pad_left        = conv_info.pad_left();
+        const unsigned int conv_pad_top         = conv_info.pad_top();
         const int          fixed_point_position = input->info()->fixed_point_position();
 
         // setup output window for the iterator
@@ -707,7 +713,7 @@
 
         execute_window_loop(window_out, [&](const Coordinates & id)
         {
-            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+            const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
             uint8_t       *out_ptr   = out.ptr();
             int            ih        = 0;
             int            oh        = 0;
@@ -804,8 +810,8 @@
         const int          delta_input          = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
         const int          kernel_depth         = weights->info()->dimension(Window::DimZ);
         const unsigned int conv_stride_y        = std::get<1>(conv_info.stride());
-        const unsigned int conv_pad_x           = std::get<0>(conv_info.pad());
-        const unsigned int conv_pad_y           = std::get<1>(conv_info.pad());
+        const unsigned int conv_pad_left        = conv_info.pad_left();
+        const unsigned int conv_pad_top         = conv_info.pad_top();
         const int          fixed_point_position = input->info()->fixed_point_position();
 
         // setup output window for the iterator
@@ -831,7 +837,7 @@
 
         execute_window_loop(window_out, [&](const Coordinates & id)
         {
-            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+            const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
             uint8_t       *out_ptr   = out.ptr();
             int            ih        = 0;
             int            oh        = 0;
@@ -1016,12 +1022,6 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
-                                    "Pad > 0 not supported for 1x1 weights");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
-                                    "Pad > 1 not supported for 3x3 weights");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
-                                    "Pad > 2 not supported for 5x5 weights");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
@@ -1051,11 +1051,10 @@
                                                         unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
 {
     // Calculate right and bottom border
-    unsigned int       kernel_size   = weights->dimension(0);
-    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
-    const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
-    const int          input_width   = input->dimension(0);
-    const int          input_height  = input->dimension(1);
+    unsigned int kernel_size   = weights->dimension(0);
+    const int    conv_stride_x = std::get<0>(conv_info.stride());
+    const int    conv_stride_y = std::get<1>(conv_info.stride());
+    const int    input_width   = input->dimension(0);
 
     switch(kernel_size)
     {
@@ -1120,22 +1119,28 @@
         }
     }
 
+    // Calculate right pad
+    int start_x       = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
+    int end_x         = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
+    int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
+
     // Calculate border
-    int upper_bound_w = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_info.pad_left() - conv_info.pad_right() - input_width;
-    int upper_bound_h = ((output->dimension(1) - 1) * conv_stride_y - conv_info.pad_top() - conv_info.pad_bottom() + kernel_size) - input_height;
+    const unsigned int conv_pad_left   = conv_info.pad_left();
+    const unsigned int conv_pad_top    = conv_info.pad_top();
+    const unsigned int conv_pad_right  = std::max(upper_bound_w, 0);
+    const unsigned int conv_pad_bottom = conv_info.pad_bottom();
 
-    const unsigned int conv_pad_left   = std::max(upper_bound_w - static_cast<int>(conv_info.pad_right()), static_cast<int>(kernel_size) / 2);
-    const unsigned int conv_pad_top    = std::max(upper_bound_h - static_cast<int>(conv_info.pad_bottom()), static_cast<int>(kernel_size) / 2);
-    const unsigned int conv_pad_right  = std::max(upper_bound_w - static_cast<int>(conv_info.pad_left()), static_cast<int>(kernel_size) / 2);
-    const unsigned int conv_pad_bottom = std::max(upper_bound_h - static_cast<int>(conv_info.pad_top()), static_cast<int>(kernel_size) / 2);
-
-    border_size.right  = conv_pad_right;
-    border_size.bottom = conv_pad_bottom;
     border_size.left   = conv_pad_left;
     border_size.top    = conv_pad_top;
+    border_size.right  = conv_pad_right;
+    border_size.bottom = conv_pad_bottom;
 
-    Window                 win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-    AccessWindowStatic     input_access(input, -conv_pad_left, -conv_pad_top, input_width + conv_pad_right, input_height + conv_pad_bottom);
+    // Configure window
+    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+                                       num_elems_read_per_iteration, kernel_size,
+                                       conv_stride_x, conv_stride_y);
     AccessWindowStatic     weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
     AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
@@ -1202,7 +1207,7 @@
     unsigned int num_weight_elems_read_per_row   = 0;
     unsigned int num_elems_read_per_iteration    = 0;
     unsigned int num_elems_written_per_iteration = 0;
-    BorderSize   border_size(conv_info.pad().first, conv_info.pad().second);
+    BorderSize   border_size                     = {};
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                               weights->clone().get(),

diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 52880a3..08d8f8c 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp

@@ -59,6 +59,10 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
         }
+        else if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
         else
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
@@ -68,7 +72,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_data_type_quantized(input->data_type()), "Calling output stage kernel with floating point arguments");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_float(input->data_type()), "Calling output stage kernel with floating point arguments");
     }
 
     // Checks performed when output is configured
@@ -447,8 +451,10 @@
             break;
         }
         case DataType::S32:
+        {
             _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
             break;
+        }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {

diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
index 5a2e1a0..c6c6c45 100644
--- a/src/core/NEON/kernels/NEFillArrayKernel.cpp
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,6 +81,9 @@
             p.y               = id.y();
             p.strength        = value;
             p.tracking_status = 1;
+            p.scale           = 0.f;
+            p.orientation     = 0.f;
+            p.error           = 0.f;
 
             if(!_output->push_back(p))
             {

diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 2f8afd8..12755a4 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <arm_neon.h>
 #include <cstddef>
@@ -37,6 +38,7 @@
 #include <tuple>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
@@ -178,12 +180,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, input->info()->dimension(0) * 4);
-    output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
-
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
 
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));

diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 3d41548..ee334df 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -143,13 +143,10 @@
     // If a_offset == 0, vector_sum_col can be a nullptr
     if(a_offset != 0)
     {
-        TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); // NOLINT
-        vector_sum_col_shape.collapse(1);
-
         // Check if vector_sum_col_shape should be slidden or not
         // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
         // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-        _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
+        _slide_vector_sum_col = vector_sum_col->info()->tensor_shape().num_dimensions() > 1;
     }
 
     // Configure kernel window
@@ -201,7 +198,7 @@
         Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
         Iterator mm_result(_mm_result, window);
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
         {
             // Compute the leftover term due to a_offset.
             int32x4x4_t a_offset_term_s32 =

diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index 3dd59bd..cab3c7a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,6 +39,42 @@
 
 using namespace arm_compute;
 
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) // Argument checks shared by configure() and validate()
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); // accum: 1 channel, QS8/QS16/F16/F32 only
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
+    ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); // biases may have at most one dimension
+    ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0)); // biases and accum must have the same extent along dimension 0
+
+    return Status{}; // default-constructed Status; presumably the "no error" value - confirm against Status
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases) // Returns {padding status, execution window} for accum/biases
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16; // step used for the window and for every access window below
+
+    // Configure kernel window
+    Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration),
+                                                    AccessWindowStatic(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->tensor_shape().y())); // biases extent along x is rounded up to a multiple of the step
+
+    AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration);
+
+    // Set the valid region for the accum tensor
+    Coordinates coord;
+    coord.set_num_dimensions(accum->num_dimensions());
+    output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape())); // valid region spans the whole accum shape
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; // a changed window is reported as an error, not silently accepted
+    return std::make_pair(err, win);
+}
+} // namespace
+
 NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
     : _accum(nullptr), _biases(nullptr)
 {
@@ -46,31 +82,26 @@
 
 void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
-    ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
 
     _biases = biases;
     _accum  = accum;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
     // Configure kernel window
-    Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(accum->info(), biases->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowStatic(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->tensor_shape().y()));
+Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get()).first);
 
-    AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
-
-    // Set the valid region for the accum tensor
-    Coordinates coord;
-    coord.set_num_dimensions(accum->info()->num_dimensions());
-    output_access.set_valid_region(win, ValidRegion(coord, accum->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)

diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index aa5e2dd..69b052a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,6 +36,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
@@ -1409,27 +1411,73 @@
     ina, inb, out);
 }
 
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
 {
+    ARM_COMPUTE_UNUSED(alpha);
+
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-    ARM_COMPUTE_UNUSED(input0);
-    ARM_COMPUTE_UNUSED(input1);
-    ARM_COMPUTE_UNUSED(output);
 
-    if(output->dimension(1) == 1)
+    if(!is_interleaved)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+            ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+        }
+    }
+    else
+    {
+        const int m                         = reshape_info.m();
+        const int n                         = reshape_info.n();
+        const int k                         = reshape_info.k();
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        /* Interleave */
+        TensorShape tensor_shape0{ input0->tensor_shape() };
+        tensor_shape0.set(0, k);
+        tensor_shape0.set(1, m);
+
+        const TensorInfo tensor_info0          = input0->clone()->set_tensor_shape(tensor_shape0);
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+
+        if(n != 0) /* Transpose */
+        {
+            TensorShape tensor_shape1{ input1->tensor_shape() };
+            tensor_shape1.set(0, n);
+            tensor_shape1.set(1, k);
+
+            const TensorInfo tensor_info1          = input1->clone()->set_tensor_shape(tensor_shape1);
+            const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+        }
+
+        if(output->total_size() != 0)
+        {
+            if(n != 0)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+            }
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+        }
     }
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
 {
-    Window win            = Window();
-    bool   window_changed = false;
+    bool   window_changed{};
+    Window win{};
 
     unsigned int       num_elems_processed_per_iteration_x = 0;
     const unsigned int num_elems_processed_per_iteration_y = 4;
@@ -1538,11 +1586,19 @@
 {
 }
 
-void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
+void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
 {
-    // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+
+    // Output tensor auto initialization if not yet initialized
+    TensorShape tensor_shape{ input0->info()->tensor_shape() };
+    tensor_shape.set(0, is_interleaved ? reshape_info.n() : input1->info()->dimension(0));
+    tensor_shape.set(1, is_interleaved ? reshape_info.m() : input0->info()->dimension(1));
+
+    auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), alpha, is_interleaved, reshape_info));
 
     _input0 = input0;
     _input1 = input1;
@@ -1555,9 +1611,10 @@
     INEKernel::configure(win_config.second);
 }
 
-Status NEGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+Status NEGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved,
+                                            const GEMMReshapeInfo &reshape_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, alpha, is_interleaved, reshape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
 
     return Status{};

diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index fe79df2..c1e975e 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,65 +39,24 @@
 
 using namespace arm_compute;
 
-NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+template <typename I0, typename I1, typename O>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out)
 {
+    ARM_COMPUTE_ERROR("Unsupported data types");
+    ARM_COMPUTE_UNUSED(window_in);
+    ARM_COMPUTE_UNUSED(window_w);
+    ARM_COMPUTE_UNUSED(window_out);
 }
 
-void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+namespace arm_compute
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
-
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
-
-    // Configure kernel window
-    const unsigned int num_elems_read_per_iteration = 4;
-
-    Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
-
-    AccessWindowHorizontal input0_access(input0->info(), 0, num_elems_read_per_iteration);
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
-
-    update_window_and_padding(win, input0_access, input1_access, output_access);
-
-    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info)
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in,
+                                                                                   const Window &window_w,
+                                                                                   const Window &window_out)
 {
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
-    Window window_slice = window.first_slice_window_3D();
-
-    Window window_in(window);
-    Window window_weights(window_slice);
-    Window window_out(window);
-
-    // Setup input0 slice
-    window_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0)));
-    window_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1), 1));
-    window_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
-
-    // Setup input1 and output slice. Their dimensions are increased in the kernel.
-    window_weights.set(Window::DimX, Window::Dimension(0, 0, 0));
-    window_weights.set(Window::DimY, Window::Dimension(0, 0, 0));
-    window_weights.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
     Iterator in(_input0, window_in);
-    Iterator in2(_input1, window_weights);
+    Iterator in2(_input1, window_w);
     Iterator out(_output, window_out);
 
     const int input_w          = _input0->info()->dimension(0);
@@ -129,3 +88,163 @@
     },
     in, in2, out);
 }
+
+template <> // Specialisation that configure() selects for DataType::QASYMM8 inputs; writes int32_t results
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>(const Window &window_in,
+                                                                                         const Window &window_w,
+                                                                                         const Window &window_out)
+{
+    Iterator in(_input0, window_in);
+    Iterator in2(_input1, window_w);
+    Iterator out(_output, window_out);
+
+    const int input_offset   = -_input0->info()->quantization_info().offset; // negated, so the additions below subtract the quantization offset
+    const int weights_offset = -_input1->info()->quantization_info().offset;
+
+    const int input_w          = _input0->info()->dimension(0);
+    const int input_h          = _input0->info()->dimension(1);
+    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
+    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+    const int output_stride_x  = _output->info()->strides_in_bytes().x();
+    const int read_step        = 16 / _input0->info()->element_size(); // number of elements covered by one 16-byte load
+
+    const int32x4_t v_input_offset   = vdupq_n_s32(input_offset);
+    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        // Get pointers
+        const uint8_t *const input_ptr   = in.ptr();
+        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; // weights row is chosen by the input z index
+        auto                 output_ptr  = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); // one result per (y, z), stored linearly along x
+
+        int32x4_t row_dot = vdupq_n_s32(0); // four partial sums, reduced to a scalar after the loop
+        for(int i = 0; i < input_w; i += read_step)
+        {
+            // Read values
+            const auto input   = vld1q_u8(reinterpret_cast<const uint8_t *>(input_ptr + i * input_stride_x));
+            const auto weights = vld1q_u8(reinterpret_cast<const uint8_t *>(weights_ptr + i * weights_stride_x));
+
+            // Add offsets (each u8 lane is widened to 16 bits, then widened again and added to the 32-bit offset vector)
+            const int32x4x4_t input_s32 =
+            {
+                {
+                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(input))))),
+                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(input))))),
+                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(input))))),
+                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(input)))))
+                }
+            };
+            const int32x4x4_t weights_s32 =
+            {
+                {
+                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(weights))))),
+                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(weights))))),
+                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(weights))))),
+                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(weights)))))
+                }
+            };
+
+            // Dot (multiply matching lanes and accumulate into the four partial sums)
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0]));
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1]));
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2]));
+            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3]));
+        }
+
+        // Reduction
+        auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot)); // 4 lanes -> 2 lanes
+        temp      = vpadd_s32(temp, temp); // pairwise add: lane 0 now holds the sum of all four partial sums
+
+        *output_ptr = vget_lane_s32(temp, 0);
+    },
+    in, in2, out);
+}
+} //namespace arm_compute
+
+NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel() // Default state: nothing selected or bound until configure() runs
+    : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
+{
+    return _border_size; // value stored by configure(); 0 until then
+}
+
+void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) // Checks the tensors, picks the typed implementation, sets border and window
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32); // NOTE(review): no ARM_COMPUTE_ERROR_ON_NULLPTR here, unlike other configure() methods in this change - confirm null tensors are handled
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32)); // asymmetric-quantized input requires an S32 output
+    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1)); // input0 dimension 2 must equal input1 dimension 1
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Set appropriate function to run
+    switch(input0->info()->data_type())
+    {
+        case DataType::QASYMM8:
+            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
+            break;
+        case DataType::F32:
+            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+
+    // Configure kernel window
+    const unsigned int num_elems_read_per_iteration = 16 / _input0->info()->element_size(); // elements per 16-byte read
+
+    const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0); // elements missing to round the input width up to whole reads
+    _border_size                = BorderSize(0, border_x);
+
+    Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
+
+    AccessWindowHorizontal input0_access(input0->info(), 0, num_elems_read_per_iteration);
+    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input0_access, input1_access, output_access); // return value is not checked here
+
+    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape())); // whole output shape is marked valid
+
+    INEKernel::configure(win);
+}
+
+void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info) // Builds the three per-tensor windows and calls the implementation chosen in configure()
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window window_slice = window.first_slice_window_3D();
+
+    Window window_in(window);
+    Window window_weights(window_slice);
+    Window window_out(window);
+
+    // Setup input0 slice
+    window_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0))); // last argument equals the full width (assuming Dimension(start, end, step) - confirm)
+    window_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1), 1));
+    window_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
+
+    // Setup input1 and output slice. Their dimensions are increased in the kernel.
+    window_weights.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_weights.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_weights.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    if(_func != nullptr) // _func is null until configure() selects an implementation
+    {
+        (this->*_func)(window_in, window_weights, window_out);
+    }
+}

diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 8eb235b..4fa329b 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
@@ -42,14 +44,34 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                          bool has_bias, bool is_fully_connected, bool is_flatten)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
-    ARM_COMPUTE_UNUSED(kernel_dims);
-    ARM_COMPUTE_UNUSED(conv_info);
+
+    if(is_flatten) /* Called by FlattenLayer */
+    {
+        size_t flatten_shape = input->tensor_shape().x() * input->tensor_shape().y() * input->tensor_shape().z();
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != flatten_shape);
+    }
+    else if(!is_fully_connected) /* Called by ConvolutionLayer */
+    {
+        std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_dims.width, kernel_dims.height, conv_info);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(2) * kernel_dims.area() + (has_bias ? 1 : 0)));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != (out_dims.first * out_dims.second));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(2) != 1);
+    }
+    else /* Called by FullyConnectedLayer */
+    {
+        const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
+        const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
+
+        TensorInfo expected_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_im2col_shape(input, num_input_dimensions));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+    }
 
     return Status{};
 }
@@ -68,7 +90,8 @@
                              int                  input_stride_x,
                              int                  input_stride_y,
                              int                  input_stride_z,
-                             int                  fixed_point_position)
+                             int                  fixed_point_position,
+                             int                  pad_value)
 {
     const int kernel_size2 = kernel_width * kernel_height;
     const int x_e          = top_left_x + kernel_width;
@@ -85,12 +108,12 @@
         {
             if((y < 0 || y >= input_h) && has_pads)
             {
-                // All the values will be zeros
+                // All the values will be the offset (will be zeros when not quantized)
                 for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
                 {
-                    *(out_ptr + 0 * kernel_size2) = 0;
-                    *(out_ptr + 1 * kernel_size2) = 0;
-                    *(out_ptr + 2 * kernel_size2) = 0;
+                    *(out_ptr + 0 * kernel_size2) = pad_value;
+                    *(out_ptr + 1 * kernel_size2) = pad_value;
+                    *(out_ptr + 2 * kernel_size2) = pad_value;
                 }
             }
             else
@@ -99,9 +122,9 @@
                 {
                     if((x < 0 || x >= input_w) && has_pads)
                     {
-                        *(out_ptr + 0 * kernel_size2) = 0;
-                        *(out_ptr + 1 * kernel_size2) = 0;
-                        *(out_ptr + 2 * kernel_size2) = 0;
+                        *(out_ptr + 0 * kernel_size2) = pad_value;
+                        *(out_ptr + 1 * kernel_size2) = pad_value;
+                        *(out_ptr + 2 * kernel_size2) = pad_value;
                     }
                     else
                     {
@@ -122,8 +145,8 @@
         {
             if((y < 0 || y >= input_h) && has_pads)
             {
-                // All the values will be zeros
-                memset(out_ptr, 0, kernel_width * sizeof(T));
+                // All the values will be the offset (will be zeros when not quantized)
+                memset(out_ptr, pad_value, kernel_width * sizeof(T));
                 out_ptr += kernel_width;
             }
             else
@@ -132,7 +155,7 @@
                 {
                     if((x < 0 || x >= input_w) && has_pads)
                     {
-                        *out_ptr = 0;
+                        *out_ptr = pad_value;
                     }
                     else
                     {
@@ -174,6 +197,7 @@
     const int input_stride_x = _input->info()->strides_in_bytes().x();
     const int input_stride_y = _input->info()->strides_in_bytes().y();
     const int input_stride_z = _input->info()->strides_in_bytes().z();
+    const int offset         = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
 
     int pad_left = 0;
     int pad_top  = 0;
@@ -226,7 +250,8 @@
                                       input_stride_x,
                                       input_stride_y,
                                       input_stride_z,
-                                      _input->info()->fixed_point_position());
+                                      _input->info()->fixed_point_position(),
+                                      offset);
     },
     in, out);
 }
@@ -288,12 +313,15 @@
 {
 }
 
-void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                               bool has_bias, bool is_fully_connected, bool is_flatten)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias));
+    ARM_COMPUTE_UNUSED(is_fully_connected);
+    ARM_COMPUTE_UNUSED(is_flatten);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
 
     _input          = input;
     _output         = output;
@@ -335,6 +363,9 @@
             case DataType::QS16:
                 _func = &NEIm2ColKernel::run_reduced<qint16_t>;
                 break;
+            case DataType::QASYMM8:
+                _func = &NEIm2ColKernel::run_reduced<qasymm8_t>;
+                break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
@@ -358,6 +389,9 @@
             case DataType::QS16:
                 _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
                 break;
+            case DataType::QASYMM8:
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qasymm8_t, false> : &NEIm2ColKernel::run_generic<qasymm8_t, true>;
+                break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
@@ -373,9 +407,10 @@
     IKernel::configure(window);
 }
 
-Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+// Validation entry point: forwards every argument, including the is_fully_connected and is_flatten flags,
+// to validate_arguments() and returns an empty Status when that check does not report an error.
+Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                                bool has_bias, bool is_fully_connected, bool is_flatten)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
     return Status{};
 }
 

diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index a81725f..01be36b 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -96,7 +96,6 @@
     // First one will use vector operations, second one processes the left over pixels
     Window window_input(window);
     window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_input.collapse_if_possible(INEKernel::window(), 3);
     window_input.set(3, Window::Dimension(0, 1, 1));
 
     Iterator input(_input, window_input);

diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
new file mode 100644
index 0000000..ae1d48c
--- /dev/null
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp

@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace
+{
+#include "arm_compute/core/NEON/kernels/convolution/common/shims.hpp"
+} // namespace
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+/** Check that a permute configuration can be executed by NEPermuteKernel.
+ *
+ * @param[in] input  Source tensor info; must be single-channel with one of the data types listed in the check below.
+ * @param[in] output Destination tensor info; only compared against @p input when already initialised (total_size() != 0).
+ * @param[in] perm   Permutation vector; must be the 3D vector [2, 0, 1] or [1, 2, 0].
+ *
+ * @return An empty Status on success, an error Status otherwise.
+ */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::QS16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    // run_permute() only implements the two 3D permutations below and raises an error for anything else,
+    // so vectors of any other rank must be rejected here too instead of slipping through validation.
+    const bool is_chw_to_hwc = (perm.num_dimensions() == 3) && (perm[0] == 2) && (perm[1] == 0) && (perm[2] == 1);
+    const bool is_hwc_to_chw = (perm.num_dimensions() == 3) && (perm[0] == 1) && (perm[1] == 2) && (perm[2] == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_chw_to_hwc && !is_hwc_to_chw),
+                                    "Only [2, 0, 1] and [1, 2, 0] permutation is supported");
+
+    const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+/** Permute the part of the input tensor covered by @p window into the output tensor.
+ *
+ * Only the permutation vectors [2, 0, 1] and [1, 2, 0] are handled; any other vector raises an error.
+ * The element copy itself is delegated to reorder::nchw_to_nhwc() / reorder::nhwc_to_nchw().
+ *
+ * @tparam T Element type used to address the tensors (byte strides are divided by sizeof(T)).
+ *
+ * @param[in] window Region of the input tensor to process.
+ */
+template <typename T>
+void NEPermuteKernel::run_permute(const Window &window)
+{
+    // Input window: the step of dimensions 0-3 is set to the full extent (end - start) of that dimension.
+    // NOTE(review): presumably this makes execute_window_loop() call the lambda once per window - confirm.
+    Window window_in = window;
+    window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+    window_in.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+    window_in.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+    window_in.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
+
+    // Output window: dimensions 0 to _perm.num_dimensions() (inclusive bound, so dimension 3 as well for a 3D
+    // permutation) are collapsed to an empty (0, 0, 0) range; the output position is added manually through idx below.
+    Window                  window_out(window);
+    const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
+    for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+    {
+        window_out.set(d, zero_window);
+    }
+
+    // Create iterators
+    Iterator in(_input, window_in);
+    Iterator out(_output, window_out);
+
+    // CHW -> HWC
+    if((_perm.num_dimensions() == 3) && (_perm[0] == 2) && (_perm[1] == 0) && (_perm[2] == 1))
+    {
+        // Strides expressed in elements of T rather than bytes
+        const int in_row_stride     = _input->info()->strides_in_bytes().y() / sizeof(T);
+        const int in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
+        const int in_batch_stride   = _input->info()->strides_in_bytes()[3] / sizeof(T);
+
+        const int out_channel_stride = _output->info()->strides_in_bytes().x() / sizeof(T);
+        const int out_col_stride     = _output->info()->strides_in_bytes().y() / sizeof(T);
+        const int out_row_stride     = _output->info()->strides_in_bytes().z() / sizeof(T);
+        const int out_batch_stride   = _output->info()->strides_in_bytes()[3] / sizeof(T);
+
+        // Rows come from the window's Y extent (see window_in above); the other sizes come from the full input shape
+        const int n_cols     = _input->info()->tensor_shape().x();
+        const int n_rows     = window_in.y().step();
+        const int n_channels = _input->info()->tensor_shape().z();
+        const int n_batches  = _input->info()->tensor_shape()[3];
+
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            // Element offset in the output of input coordinate (id[0], id[1], id[2]) = (column, row, channel)
+            const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
+            reorder::nchw_to_nhwc(reinterpret_cast<const T *>(in.ptr()), reinterpret_cast<T *>(out.ptr()) + idx,
+                                  n_batches, n_channels, n_rows, n_cols,
+                                  in_batch_stride, in_channel_stride, in_row_stride,
+                                  out_batch_stride, out_row_stride, out_col_stride);
+        },
+        in, out);
+    }
+    // HWC -> CHW
+    else if((_perm.num_dimensions() == 3) && (_perm[0] == 1) && (_perm[1] == 2) && (_perm[2] == 0))
+    {
+        // Strides expressed in elements of T rather than bytes
+        const int in_col_stride   = _input->info()->strides_in_bytes().y() / sizeof(T);
+        const int in_row_stride   = _input->info()->strides_in_bytes().z() / sizeof(T);
+        const int in_batch_stride = _input->info()->strides_in_bytes()[3] / sizeof(T);
+
+        const int out_col_stride     = _output->info()->strides_in_bytes().x() / sizeof(T);
+        const int out_row_stride     = _output->info()->strides_in_bytes().y() / sizeof(T);
+        const int out_channel_stride = _output->info()->strides_in_bytes().z() / sizeof(T);
+        const int out_batch_stride   = _output->info()->strides_in_bytes()[3] / sizeof(T);
+
+        // Columns come from the window's Y extent (see window_in above); the other sizes come from the full input shape
+        const int n_channels = _input->info()->tensor_shape().x();
+        const int n_cols     = window_in.y().step();
+        const int n_rows     = _input->info()->tensor_shape().z();
+        const int n_batches  = _input->info()->tensor_shape()[3];
+
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            // Element offset in the output of input coordinate (id[0], id[1], id[2]) = (channel, column, row)
+            const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
+            reorder::nhwc_to_nchw(reinterpret_cast<const T *>(in.ptr()), reinterpret_cast<T *>(out.ptr()) + idx,
+                                  n_batches, n_rows, n_cols, n_channels,
+                                  in_batch_stride, in_row_stride, in_col_stride,
+                                  out_batch_stride, out_channel_stride, out_row_stride);
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported permutation vector");
+    }
+}
+
+/** Default constructor: no function selected, null input/output tensors and a default-constructed permutation vector. */
+NEPermuteKernel::NEPermuteKernel()
+    : _func(), _input(nullptr), _output(nullptr), _perm()
+{
+}
+
+/** Set up the kernel: initialise the output info if needed, validate the arguments, select the
+ *  run_permute() instantiation matching the element size and configure the execution window.
+ *
+ * @param[in]  input  Source tensor; must not be null.
+ * @param[out] output Destination tensor; must not be null. Its info is auto-initialised from @p input with the permuted shape when empty.
+ * @param[in]  perm   Permutation vector to apply.
+ */
+void NEPermuteKernel::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input->info(), perm);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
+
+    _input  = input;
+    _output = output;
+    _perm   = perm;
+
+    // Dispatch on element size only: an unsigned integer of matching width stands in for every data type
+    switch(input->info()->element_size())
+    {
+        case 1:
+            _func = &NEPermuteKernel::run_permute<uint8_t>;
+            break;
+        case 2:
+            _func = &NEPermuteKernel::run_permute<uint16_t>;
+            break;
+        case 4:
+            _func = &NEPermuteKernel::run_permute<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The NEPermute doesn't need padding so update_window_and_padding() can be skipped
+    // The whole output tensor is marked as valid, anchored at default-constructed coordinates
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    ICPPKernel::configure(win);
+}
+
+/** Check whether the given tensor infos and permutation vector form a valid NEPermuteKernel configuration.
+ *
+ * Forwards to validate_arguments().
+ *
+ * @param[in] input  Source tensor info.
+ * @param[in] output Destination tensor info.
+ * @param[in] perm   Permutation vector.
+ *
+ * @return An empty Status when validate_arguments() reports no error.
+ */
+Status NEPermuteKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
+    return Status{};
+}
+
+/** Execute the permute function selected by configure() on the given window.
+ *
+ * @param[in] window Region to process; checked against the window the kernel was configured with.
+ * @param[in] info   Thread information (unused).
+ */
+void NEPermuteKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    // Nothing to dispatch when no implementation has been selected
+    if(_func == nullptr)
+    {
+        return;
+    }
+
+    (this->*_func)(window);
+}

diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index ff4802c..b6af517 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp

@@ -60,13 +60,13 @@
 }
 
 template <bool exclude_padding>
-inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
+inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int       start_x = id.x() * stride_x - pad_x;
     int       start_y = id.y() * stride_y - pad_y;
-    const int end_x   = std::min(start_x + pool_size, upper_bound_w);
-    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
+    const int end_x   = std::min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = std::min(start_y + pool_size_y, upper_bound_h);
     if(exclude_padding)
     {
         start_x = std::max(0, start_x);
@@ -151,27 +151,23 @@
     v = vsetq_lane_u16(elems[7], v, 7);
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size_x)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-    int                 pool_pad_x        = 0;
-    int                 pool_pad_y        = 0;
-    int                 pool_stride_x     = 0;
-    int                 pool_stride_y     = 0;
-    PoolingType         pool_type         = pool_info.pool_type();
-    const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info();
-    const bool          exclude_padding   = pool_info.exclude_padding();
-    const bool          is_global_pooling = pool_info.is_global_pooling();
-    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    PoolingType         pool_type       = pool_info.pool_type();
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+    const bool          exclude_padding = pool_info.exclude_padding();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
     static const std::set<int> supported_pool_sizes = { 2, 3 };
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
-    ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)));
-    ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
+
+    ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))
+                                && (pool_type != PoolingType::MAX));
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
 
@@ -185,148 +181,148 @@
     return Status{};
 }
 
-Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
+// Sanity check of the pooling window extents: reports an error if either the X or the Y size is zero,
+// otherwise returns an empty Status.
+Status validate_arguments_pool_info(const unsigned int pool_size_x, const unsigned int pool_size_y)
 {
-    const bool is_global_pooling = pool_info.is_global_pooling();
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
-                                    "Global pooling is supported only with rectangular inputs!");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
-                                    "Invalid pool size and pool pad combination!");
+    ARM_COMPUTE_RETURN_ERROR_ON(pool_size_x == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(pool_size_y == 0);
 
     return Status{};
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
                                                         BorderSize &border_size,
-                                                        unsigned int pooled_w, unsigned int pooled_h, int pool_size)
+                                                        unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
 {
     unsigned int        num_elems_read_per_iteration = 0;
     unsigned int        num_elems_horizontal_window  = 0;
-    int                 pool_pad_x                   = 0;
-    int                 pool_pad_y                   = 0;
     int                 pool_stride_x                = 0;
     int                 pool_stride_y                = 0;
     const int           input_width                  = input->dimension(0);
     const int           input_height                 = input->dimension(1);
     const PadStrideInfo pad_stride_info              = pool_info.pad_stride_info();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
-
+    const int  pool_pad_right  = pad_stride_info.pad_right();
+    const int  pool_pad_top    = pad_stride_info.pad_top();
+    const int  pool_pad_left   = pad_stride_info.pad_left();
+    const int  pool_pad_bottom = pad_stride_info.pad_bottom();
+    const bool is_square       = pool_size_x == pool_size_y;
     // Check output dimensions
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
                                                      input->dimension(1),
-                                                     pool_size,
-                                                     pool_size,
+                                                     pool_size_x,
+                                                     pool_size_y,
                                                      pad_stride_info);
 
-    // Select element size
-    switch(input->data_type())
-    {
-        case DataType::QS8:
-            num_elems_read_per_iteration = 16;
-            switch(pool_size)
-            {
-                case 2:
-                    num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
-                    break;
-                case 3:
-                    num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Pooling size not supported");
-                    break;
-            }
-            num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
-            break;
-        case DataType::QASYMM8:
-            switch(pool_size)
-            {
-                case 2:
-                    num_elems_read_per_iteration      = 16;
-                    num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
-                    num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
-                    break;
-                case 3:
-                    num_elems_read_per_iteration      = 16;
-                    num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
-                    num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
-                    break;
-                default:
-                    num_elems_read_per_iteration      = 1;
-                    num_elems_processed_per_iteration = 1;
-                    num_elems_horizontal_window       = 1;
-                    break;
-            }
-            break;
-        case DataType::QS16:
-            num_elems_read_per_iteration = 8;
-            switch(pool_size)
-            {
-                case 2:
-                    num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
-                    break;
-                case 3:
-                    num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Pooling size not supported");
-            }
-            num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            switch(pool_size)
-            {
-                case 2:
-                    num_elems_read_per_iteration      = 16;
-                    num_elems_processed_per_iteration = 8;
-                    num_elems_horizontal_window       = 8;
-                    break;
-                case 3:
-                    num_elems_read_per_iteration      = 4;
-                    num_elems_processed_per_iteration = 1;
-                    num_elems_horizontal_window       = 1;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Pooling size not supported");
-                    break;
-            }
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
-            switch(pool_size)
-            {
-                case 2:
-                    num_elems_read_per_iteration = 2;
-                    break;
-                case 3:
-                    num_elems_read_per_iteration = 4; // We use vload4 for pooling3
-                    break;
-                case 7:
-                    num_elems_read_per_iteration = 8; // We use vload8 for pooling7
-                    break;
-                default:
-                    num_elems_read_per_iteration = 1; // We use vload4 for poolingN but with a leftover for loop
-                    break;
-            }
-            num_elems_processed_per_iteration = 1;
-            num_elems_horizontal_window       = 1;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
+    // Defaults for the generic MxN path, which is executed when the pooling window is not square or has no optimized implementation
+    num_elems_read_per_iteration      = 1;
+    num_elems_processed_per_iteration = 1;
+    num_elems_horizontal_window       = 1;
 
+    if(is_square)
+    {
+        switch(input->data_type())
+        {
+            case DataType::QS8:
+                num_elems_read_per_iteration = 16;
+                switch(pool_size_x)
+                {
+                    case 2:
+                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
+                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+                        break;
+                    case 3:
+                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
+                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+                        break;
+                    default:
+                        break;
+                }
+                break;
+            case DataType::QASYMM8:
+                switch(pool_size_x)
+                {
+                    case 2:
+                        num_elems_read_per_iteration      = 16;
+                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
+                        break;
+                    case 3:
+                        num_elems_read_per_iteration      = 16;
+                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 8 : 16;
+                        break;
+                    default:
+                        break;
+                }
+                break;
+            case DataType::QS16:
+                num_elems_read_per_iteration = 8;
+                switch(pool_size_x)
+                {
+                    case 2:
+                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 4 : 8;
+                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
+                        break;
+                    case 3:
+                        num_elems_horizontal_window       = (pool_stride_x == 2) ? 4 : 8;
+                        num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
+                        break;
+                    default:
+                        break;
+                }
+                break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F16:
+                switch(pool_size_x)
+                {
+                    case 2:
+                        num_elems_read_per_iteration      = 16;
+                        num_elems_processed_per_iteration = 8;
+                        num_elems_horizontal_window       = 8;
+                        break;
+                    case 3:
+                        num_elems_read_per_iteration      = 4;
+                        num_elems_processed_per_iteration = 1;
+                        num_elems_horizontal_window       = 1;
+                        break;
+                    default:
+                        break;
+                }
+                break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            case DataType::F32:
+                switch(pool_size_x)
+                {
+                    case 2:
+                        num_elems_read_per_iteration = 2;
+                        break;
+                    case 3:
+                        num_elems_read_per_iteration = 4; // We use vload4 for pooling3
+                        break;
+                    case 7:
+                        num_elems_read_per_iteration = 8; // We use vload8 for pooling7
+                        break;
+                    default:
+                        break;
+                }
+                num_elems_processed_per_iteration = 1;
+                num_elems_horizontal_window       = 1;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Element size not supported");
+                break;
+        }
+    }
     // Number of iterations in X dimension
     const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
 
     // Upper limit for the number of right/bottom border elements that are accessed
-    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
-    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
+    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
 
-    border_size         = BorderSize(pool_pad_y, pool_pad_x);
-    border_size.right   = std::max(upper_bound_w, pool_pad_x);
-    border_size.bottom  = std::max(upper_bound_h, pool_pad_y);
+    border_size         = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+    border_size.right   = std::max(upper_bound_w, pool_pad_right);
+    border_size.bottom  = std::max(upper_bound_h, pool_pad_bottom);
     bool window_changed = false;
 
     TensorShape output_shape{ input->tensor_shape() };
@@ -335,7 +331,7 @@
     TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
 
     Window             win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+    AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
 
     if(output->total_size() != 0)
     {
@@ -354,7 +350,7 @@
 } // namespace
 
+// Default constructor: null function and tensors, default pooling info, zero elements per iteration, zero border, _is_square false.
 NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
 {
 }
 
@@ -374,29 +370,31 @@
     const int           pool_stride_x     = pad_stride_info.stride().first;
 
     // Update pool size in case of global pooling
-    const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size();
+    const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width;
+    const int pool_size_y = is_global_pooling ? input->info()->dimension(1) : pool_info.pool_size().height;
 
     // Validate pool info before calling scaled_dimensions
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
 
     // Check output dimensions
     unsigned int pooled_w, pooled_h;
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
                                                      input->info()->dimension(1),
-                                                     pool_size,
-                                                     pool_size,
+                                                     pool_size_x,
+                                                     pool_size_y,
                                                      pad_stride_info);
 
     // Output auto initialization if not yet initialized
     auto_init(input->info(), output->info(), pooled_w, pooled_h);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size_x));
 
     // Set instance variables
     _input     = input;
     _output    = output;
     _pool_info = pool_info;
+    _is_square = (pool_size_x == pool_size_y);
 
     // Get data type
     const DataType data_type = input->info()->data_type();
@@ -404,41 +402,63 @@
     // Select appropriate function
     if(data_type == DataType::QS8)
     {
-        switch(pool_size)
+        if(_is_square)
         {
-            case 2:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            case 3:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported pooling size!");
+            switch(pool_size_x)
+            {
+                case 2:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                case 3:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                default:
+                    switch(pool_type)
+                    {
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch(pool_type)
+            {
+                case PoolingType::MAX:
+                    _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+            }
         }
     }
     else if(data_type == DataType::QASYMM8)
     {
-        if(pool_size == 2 && pool_stride_x < 3)
+        if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
         {
             switch(pool_type)
             {
@@ -452,7 +472,7 @@
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
             }
         }
-        else if(pool_size == 3 && pool_stride_x < 3)
+        else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square)
         {
             switch(pool_type)
             {
@@ -471,10 +491,10 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, false>;
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, false>;
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::MAX>;
+                    _func = &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::MAX>;
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -483,151 +503,227 @@
     }
     else if(data_type == DataType::QS16)
     {
-        switch(pool_size)
+        if(_is_square)
         {
-            case 2:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            case 3:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported pooling size!");
+            switch(pool_size_x)
+            {
+                case 2:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                case 3:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                default:
+                    switch(pool_type)
+                    {
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch(pool_type)
+            {
+                case PoolingType::MAX:
+                    _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+            }
         }
     }
     else if(data_type == DataType::F16)
     {
-        switch(pool_size)
+        if(_is_square)
         {
-            case 2:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
-                        break;
-                    case PoolingType::L2:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            case 3:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
-                        break;
-                    case PoolingType::L2:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported pooling size!");
+            switch(pool_size_x)
+            {
+                case 2:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                case 3:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                default:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch(pool_type)
+            {
+                case PoolingType::AVG:
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+                    break;
+                case PoolingType::L2:
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+                    break;
+                case PoolingType::MAX:
+                    _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+            }
         }
     }
     else if(data_type == DataType::F32)
     {
-        switch(pool_size)
+        if(_is_square)
         {
-            case 2:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
-                        break;
-                    case PoolingType::L2:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            case 3:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
-                        break;
-                    case PoolingType::L2:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            case 7:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
-                        break;
-                    case PoolingType::L2:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
-            default:
-                switch(pool_type)
-                {
-                    case PoolingType::AVG:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
-                        break;
-                    case PoolingType::L2:
-                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
-                        break;
-                    case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
-                        break;
-                    default:
-                        ARM_COMPUTE_ERROR("Unsupported pooling type!");
-                }
-                break;
+            switch(pool_size_x)
+            {
+                case 2:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                case 3:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                case 7:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+                default:
+                    switch(pool_type)
+                    {
+                        case PoolingType::AVG:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+                            break;
+                        case PoolingType::L2:
+                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+                            break;
+                        case PoolingType::MAX:
+                            _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+                            break;
+                        default:
+                            ARM_COMPUTE_ERROR("Unsupported pooling type!");
+                    }
+                    break;
+            }
+        }
+        else
+        {
+            switch(pool_type)
+            {
+                case PoolingType::AVG:
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+                    break;
+                case PoolingType::L2:
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+                    break;
+                case PoolingType::MAX:
+                    _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Unsupported pooling type!");
+            }
         }
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
@@ -640,17 +736,18 @@
 
     const int     fixed_point_position = _input->info()->fixed_point_position();
     constexpr int pool_size            = 2;
-    int           pool_pad_x           = 0;
-    int           pool_pad_y           = 0;
     int           pool_stride_x        = 0;
     int           pool_stride_y        = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -661,7 +758,7 @@
         if(pooling_type == PoolingType::AVG)
         {
             // Calculate scale
-            const qint8_t   scale     = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+            const qint8_t   scale     = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
             const qint8x8_t scale_vec = vdup_n_qs8(scale);
 
             // Perform pooling
@@ -702,18 +799,19 @@
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    constexpr int pool_size     = 2;
-    int           pool_pad_x    = 0;
-    int           pool_pad_y    = 0;
-    int           pool_stride_x = 0;
-    int           pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    constexpr int pool_size       = 2;
+    int           pool_stride_x   = 0;
+    int           pool_stride_y   = 0;
+    const int     pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
     const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
 
@@ -752,7 +850,7 @@
             // Scale lower result
             scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
                                                 pool_size, upper_bound_w, upper_bound_h,
-                                                pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                                                pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             lower_res = vmovn_u16(res_lower);
 
             // Compute upper result for stride_x == 1
@@ -780,7 +878,7 @@
                 // Scale lower result
                 scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
                                                     pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 upper_res = vmovn_u16(res_upper);
             }
         }
@@ -817,17 +915,18 @@
 
     const int     fixed_point_position = _input->info()->fixed_point_position();
     constexpr int pool_size            = 2;
-    int           pool_pad_x           = 0;
-    int           pool_pad_y           = 0;
+    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
     int           pool_stride_x        = 0;
     int           pool_stride_y        = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
 
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -838,7 +937,7 @@
         if(pooling_type == PoolingType::AVG)
         {
             // Calculate scale
-            const qint16_t   scale     = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+            const qint16_t   scale     = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
             const qint16x4_t scale_vec = vdup_n_qs16(scale);
 
             // Perform pooling
@@ -880,19 +979,20 @@
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    constexpr const int pool_size     = 3;
-    int                 pool_pad_x    = 0;
-    int                 pool_pad_y    = 0;
-    int                 pool_stride_x = 0;
-    int                 pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    constexpr const int pool_size       = 3;
+    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -912,7 +1012,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float16x4_t scale_v = vdup_n_f16(scale);
             // Perform pooling
             const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
@@ -948,15 +1048,18 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator      input(_input, window_input);
     Iterator      output(_output, window);
-    constexpr int pool_size = 2;
-    int           pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    constexpr int pool_size       = 2;
+    const int     pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int           pool_stride_x, pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y)     = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -975,7 +1078,7 @@
 
         if(pooling_type != PoolingType::MAX)
         {
-            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float16x8_t scale_v = vdupq_n_f16(scale);
             res                       = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
         }
@@ -1007,18 +1110,19 @@
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    constexpr int pool_size     = 2;
-    int           pool_pad_x    = 0;
-    int           pool_pad_y    = 0;
-    int           pool_stride_x = 0;
-    int           pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    constexpr int pool_size       = 2;
+    const int     pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int           pool_stride_x   = 0;
+    int           pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1037,7 +1141,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1071,18 +1175,19 @@
 
     const int     fixed_point_position = _input->info()->fixed_point_position();
     constexpr int pool_size            = 3;
-    int           pool_pad_x           = 0;
-    int           pool_pad_y           = 0;
+    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
     int           pool_stride_x        = 0;
     int           pool_stride_y        = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1093,7 +1198,7 @@
         if(pooling_type == PoolingType::AVG)
         {
             // Calculate scale
-            const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+            const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
 
             // Perform pooling for stride 2
             const qint8x16_t sum_data  = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
@@ -1144,19 +1249,20 @@
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    constexpr int pool_size     = 3;
-    int           pool_pad_x    = 0;
-    int           pool_pad_y    = 0;
-    int           pool_stride_x = 0;
-    int           pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    constexpr int pool_size       = 3;
+    const int     pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int           pool_stride_x   = 0;
+    int           pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1217,7 +1323,7 @@
 
                 scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
                                                     pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
             }
             else
@@ -1225,11 +1331,11 @@
                 // Scale lower result
                 scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
                                                     pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 // Scale lower result
                 scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
                                                     pool_size, upper_bound_w, upper_bound_h,
-                                                    pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                 const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
                 vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
             }
@@ -1265,18 +1371,19 @@
 
     const int     fixed_point_position = _input->info()->fixed_point_position();
     constexpr int pool_size            = 3;
-    int           pool_pad_x           = 0;
-    int           pool_pad_y           = 0;
+    const int     pool_pad_right       = _pool_info.pad_stride_info().pad_right();
+    const int     pool_pad_top         = _pool_info.pad_stride_info().pad_top();
+    const int     pool_pad_left        = _pool_info.pad_stride_info().pad_left();
+    const int     pool_pad_bottom      = _pool_info.pad_stride_info().pad_bottom();
     int           pool_stride_x        = 0;
     int           pool_stride_y        = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
+    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
 
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1287,7 +1394,7 @@
         if(pooling_type == PoolingType::AVG)
         {
             // Calculate scale
-            const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position);
+            const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
 
             // Perform pooling for stride 2
             const qint16x8_t sum_data  = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
@@ -1333,19 +1440,20 @@
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    constexpr const int pool_size     = 3;
-    int                 pool_pad_x    = 0;
-    int                 pool_pad_y    = 0;
-    int                 pool_stride_x = 0;
-    int                 pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    constexpr const int pool_size       = 3;
+    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
-    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1366,7 +1474,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1400,20 +1508,21 @@
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    constexpr const int pool_size     = 7;
-    int                 pool_pad_x    = 0;
-    int                 pool_pad_y    = 0;
-    int                 pool_stride_x = 0;
-    int                 pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    constexpr const int pool_size       = 7;
+    const int           pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int           pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int           pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int           pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
     std::array<const uint8_t *, pool_size> input_ptrs{ {} };
     for(int i = 0; i < pool_size; ++i)
     {
-        input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + i));
+        input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
     }
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -1423,7 +1532,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1476,21 +1585,250 @@
     input, output);
 }
 
-template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
+/** Fixed-point (qint8_t) MAX pooling over a pool_size_x x pool_size_y window; the pooling_type template argument is not consulted.
+ *  @param[in] window_input Window used to iterate over the input tensor.
+ *  @param[in] window       Window to execute over; one output element is stored per iteration.
+ */
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    const int pool_size     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
-    int       pool_pad_x    = 0;
-    int       pool_pad_y    = 0;
+    const int pool_size_x   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_top  = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
     int       pool_stride_x = 0;
     int       pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int stride_x_bytes = static_cast<int>(_input->info()->strides_in_bytes().x()); // signed: (x - pool_pad_left) can be negative
+    const int stride_y_bytes = static_cast<int>(_input->info()->strides_in_bytes().y()); // signed: (y - pool_pad_top) can be negative
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        qint8x16_t vres = vdupq_n_s8(std::numeric_limits<qint8_t>::lowest()); // identity of max; a 0 seed would clamp all-negative windows to 0
+        qint8_t    res  = std::numeric_limits<qint8_t>::lowest();
+
+        // PoolingType::MAX: 16 elements per step go into vres, the remaining columns of each row into res
+        for(int y = 0; y < pool_size_y; ++y)
+        {
+            int x = 0;
+            for(; x <= (pool_size_x - 16); x += 16)
+            {
+                const qint8x16_t data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                vres                  = vmaxq_s8(vres, data);
+            }
+
+            // Leftover for loop
+            for(; x < pool_size_x; ++x)
+            {
+                const qint8_t data = *(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                res                = std::max(res, data);
+            }
+        }
+        qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres)); // Reduce: pairwise max, 16 -> 8 values
+        half_vres           = vpmax_s8(half_vres, half_vres);                  // 8 -> 4 distinct values
+        half_vres           = vpmax_s8(half_vres, half_vres);                  // 4 -> 2 distinct values
+        half_vres           = vpmax_s8(half_vres, half_vres);                  // 2 -> 1, the overall vector max is now in lane 0
+        res                 = std::max(res, vget_lane_s8(half_vres, 0));
+
+        // Store result
+        *(reinterpret_cast<qint8_t *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
+/** Fixed-point (qint16_t) MAX pooling over a pool_size_x x pool_size_y window; the pooling_type template argument is not consulted.
+ *  @param[in] window_input Window used to iterate over the input tensor.
+ *  @param[in] window       Window to execute over; one output element is stored per iteration.
+ */
+template <PoolingType pooling_type>
+void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y   = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_top  = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
+    const int stride_x_bytes = static_cast<int>(_input->info()->strides_in_bytes().x()); // signed: (x - pool_pad_left) can be negative
+    const int stride_y_bytes = static_cast<int>(_input->info()->strides_in_bytes().y()); // signed: (y - pool_pad_top) can be negative
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        qint16x8_t vres = vdupq_n_s16(std::numeric_limits<qint16_t>::lowest()); // identity of max; a 0 seed would clamp all-negative windows to 0
+        qint16_t   res  = std::numeric_limits<qint16_t>::lowest();
+
+        // PoolingType::MAX: 8 elements per step go into vres, the remaining columns of each row into res
+        for(int y = 0; y < pool_size_y; ++y)
+        {
+            int x = 0;
+            for(; x <= (pool_size_x - 8); x += 8)
+            {
+                const qint16x8_t data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                vres                  = vmaxq_s16(vres, data);
+            }
+
+            // Leftover for loop
+            for(; x < pool_size_x; ++x)
+            {
+                const qint16_t data = *(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                res                 = std::max(res, data);
+            }
+        }
+        qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres)); // Reduce: pairwise max, 8 -> 4 values
+        half_vres            = vpmax_s16(half_vres, half_vres);                    // 4 -> 2 distinct values
+        half_vres            = vpmax_s16(half_vres, half_vres);                    // 2 -> 1, the overall vector max is now in lane 0
+        res                  = std::max(res, vget_lane_s16(half_vres, 0));
+
+        // Store result
+        *(reinterpret_cast<qint16_t *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
+/** FP16 pooling over a pool_size_x x pool_size_y window.
+ *
+ * @note MAX keeps the largest element; the other pooling types sum the elements (squared for L2), multiply by calculate_avg_scale() and, for L2, take the square root.
+ * @param[in] window_input Window used to iterate over the input tensor.
+ * @param[in] window       Window to execute over; one output element is stored per iteration.
+ */
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window)
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
+    // Byte strides are kept as signed ints: (x - pool_pad_left) and (y - pool_pad_top) can be negative, so offsets must use signed arithmetic
+    const int stride_x_bytes = static_cast<int>(_input->info()->strides_in_bytes().x());
+    const int stride_y_bytes = static_cast<int>(_input->info()->strides_in_bytes().y());
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float16_t   res  = 0.0f;
+        float16x8_t vres = vdupq_n_f16(0.0f);
+
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+            // Perform pooling: 8 elements per step go into vres, the remaining columns of each row into res
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size_x - 8); x += 8)
+                {
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                    // Accumulate the elements, squared in case of l2 pooling (pooling_type is a compile-time constant)
+                    vres = vaddq_f16(vres, (pooling_type == PoolingType::L2) ? vmulq_f16(data, data) : data);
+                }
+
+                // Leftover for loop
+                for(; x < pool_size_x; ++x)
+                {
+                    float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+
+                    // Square the element in case of l2 pooling
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        data *= data;
+                    }
+
+                    res += data;
+                }
+            }
+
+            // Reduction: add the eight vector lanes to the scalar accumulator
+            float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
+            res += vget_lane_f16(tmp, 0);
+            res += vget_lane_f16(tmp, 1);
+            res += vget_lane_f16(tmp, 2);
+            res += vget_lane_f16(tmp, 3);
+
+            // Multiply by the factor returned by calculate_avg_scale
+            res *= scale;
+        }
+        else
+        {
+            // Seed with -infinity: it converts exactly to half precision, whereas numeric_limits<float>::lowest() is out of range for it
+            vres = vdupq_n_f16(-std::numeric_limits<float>::infinity());
+            res  = -std::numeric_limits<float>::infinity();
+
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                int x = 0;
+                for(; x <= (pool_size_x - 8); x += 8)
+                {
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                    vres                   = vmaxq_f16(vres, data);
+                }
+
+                // Leftover for loop
+                for(; x < pool_size_x; ++x)
+                {
+                    const float16_t data = *(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * stride_x_bytes + (y - pool_pad_top) * stride_y_bytes));
+                    res                  = std::max(res, data);
+                }
+            }
+
+            // Reduction: fold the eight vector lanes into the scalar maximum
+            float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
+            res             = std::max(res, vget_lane_f16(tmp, 0));
+            res             = std::max(res, vget_lane_f16(tmp, 1));
+            res             = std::max(res, vget_lane_f16(tmp, 2));
+            res             = std::max(res, vget_lane_f16(tmp, 3));
+        }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            res = std::sqrt(res);
+        }
+
+        // Store result
+        *(reinterpret_cast<float16_t *>(output.ptr())) = res;
+    },
+    input, output);
+
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1499,18 +1837,18 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
             float32x4_t vres = vdupq_n_f32(0.0f);
 
-            for(int y = 0; y < pool_size; ++y)
+            for(int y = 0; y < pool_size_y; ++y)
             {
                 int x = 0;
-                for(; x <= (pool_size - 4); x += 4)
+                for(; x <= (pool_size_x - 4); x += 4)
                 {
-                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
-                                                                                       (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                       (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
 
                     // Get power of 2 in case of l2 pooling and accumulate
                     if(pooling_type == PoolingType::L2)
@@ -1524,9 +1862,9 @@
                 }
 
                 // Leftover for loop
-                for(; x < pool_size; ++x)
+                for(; x < pool_size_x; ++x)
                 {
-                    float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
 
                     // Get power of 2 in case of l2 pooling
                     if(pooling_type == PoolingType::L2)
@@ -1553,23 +1891,23 @@
         }
         else
         {
-            float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::min());
-            res              = std::numeric_limits<float>::min();
+            float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
+            res              = std::numeric_limits<float>::lowest();
 
-            for(int y = 0; y < pool_size; ++y)
+            for(int y = 0; y < pool_size_y; ++y)
             {
                 int x = 0;
-                for(; x <= (pool_size - 4); x += 4)
+                for(; x <= (pool_size_x - 4); x += 4)
                 {
-                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() +
-                                                                                       (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                       (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
                     vres                   = vmaxq_f32(vres, data);
                 }
 
                 // Leftover for loop
-                for(; x < pool_size; ++x)
+                for(; x < pool_size_x; ++x)
                 {
-                    const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    const float data = *(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
                     res              = std::max(res, data);
                 }
             }
@@ -1598,20 +1936,22 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    const int pool_size     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
-    int       pool_pad_x    = 0;
-    int       pool_pad_y    = 0;
-    int       pool_stride_x = 0;
-    int       pool_stride_y = 0;
-    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1623,24 +1963,25 @@
             uint32_t   sres = 0;
 
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
-            for(int y = 0; y < pool_size; ++y)
+            for(int y = 0; y < pool_size_y; ++y)
             {
                 int x = 0;
-                for(; x <= (pool_size - 8); x += 8)
+                for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                     (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
 
                     const uint16x8_t data_u16 = vmovl_u8(data);
                     vres                      = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
                 }
 
                 // Leftover for loop
-                for(; x < pool_size; ++x)
+                for(; x < pool_size_x; ++x)
                 {
-                    uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
                     sres += data;
                 }
             }
@@ -1657,19 +1998,20 @@
             uint8x8_t vres = vdup_n_u8(0);
             res            = 0;
 
-            for(int y = 0; y < pool_size; ++y)
+            for(int y = 0; y < pool_size_y; ++y)
             {
                 int x = 0;
-                for(; x <= (pool_size - 8); x += 8)
+                for(; x <= (pool_size_x - 8); x += 8)
                 {
-                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
+                                                                                     (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
                     vres                 = vmax_u8(vres, data);
                 }
 
                 // Leftover for loop
-                for(; x < pool_size; ++x)
+                for(; x < pool_size_x; ++x)
                 {
-                    const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+                    const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
                     res                = std::max(res, data);
                 }
             }
@@ -1699,20 +2041,23 @@
     BorderSize   border_size(0);
 
     const bool         is_global_pooling = pool_info.is_global_pooling();
-    const unsigned int pool_size         = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
+    const unsigned int pool_size_x       = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
+    const unsigned int pool_size_y       = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
 
-    // Validate pool info befor calling scaled_dimensions
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
+    // Validate pool info before calling scaled_dimensions
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
 
     // Check output dimensions
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
                                                      input->dimension(1),
-                                                     pool_size,
-                                                     pool_size,
+                                                     pool_size_x,
+                                                     pool_size_y,
                                                      pool_info.pad_stride_info());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size_x));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h,
+                                                              pool_size_x, pool_size_y)
+                                .first);
 
     return Status{};
 }
@@ -1726,7 +2071,7 @@
 
     const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
     const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
-    const unsigned int pool_size     = _pool_info.pool_size();
+    const unsigned int pool_size     = _pool_info.pool_size().width;
 
     // Set step for input in x and y direction for the input
     Window       window_input(window);

diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index bff79f0..767af08 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,13 +77,11 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
     Window window_input_output(window);
-    window_input_output.collapse_if_possible(INEKernel::window(), 3);
     window_input_output.set(3, Window::Dimension(0, 1, 1));
 
     Window window_min_max;
     window_min_max.use_tensor_dimensions(_min_max->info()->tensor_shape());
     window_min_max.set(Window::DimX, Window::Dimension(0, 1, 1));
-    window_min_max.collapse_if_possible(INEKernel::window(), 1);
 
     Iterator input(_input, window_input_output);
     Iterator output(_output, window_input_output);

diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 9b8d931..66115bb 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
 {
 inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride)
 {
-    static const float32x4_t lowerxy = vdupq_n_f32(-1.0f);
+    const float32x4_t lowerxy = vdupq_n_f32(-1.f);
 
     float32x4_t x = vld1q_f32(mapx_ptr);
     float32x4_t y = vld1q_f32(mapy_ptr);
@@ -113,11 +113,10 @@
     AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
 
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapx_access(map_x->info(), 0, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapy_access(map_y->info(), 0, 0, num_elems_processed_per_iteration);
 
-    update_window_and_padding(win, input_access,
-                              AccessWindowRectangle(map_x->info(), 0, 0, num_elems_processed_per_iteration, 1),
-                              AccessWindowRectangle(map_y->info(), 0, 0, num_elems_processed_per_iteration, 1),
-                              output_access);
+    update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
 
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
@@ -152,27 +151,24 @@
         const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride);
         const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride);
 
-        uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp0, 0);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp0, 1);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp0, 2);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp0, 3);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp0, 4);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp0, 5);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp0, 6);
-        tmp0           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp0, 7);
-
-        uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp1, 0);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp1, 1);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp1, 2);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp1, 3);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp1, 4);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp1, 5);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp1, 6);
-        tmp1           = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp1, 7);
-
-        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+        uint8x16_t tmp = vdupq_n_u8(0);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14);
+        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15);
+        vst1q_u8(out.ptr(), tmp);
     },
     in, out, mapx, mapy);
 }

diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index a0f324e..45ba68d 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,6 @@
     const TensorShape &output_shape = output->info()->tensor_shape();
     Coordinates        output_coord{};
 
-    window.collapse_if_possible(window, 3);
     Iterator in(input, window);
 
     execute_window_loop(window, [&](const Coordinates & id)

diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index b13fb0e..13d87a0 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,285 +33,433 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/utility.h"
 
 #include <algorithm>
 #include <arm_neon.h>
 #include <cfloat>
+#include <functional>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+template <typename T, int N> // T: element type, N: lane count (see the specialisations generated by DECLARE_NEON_VEC_TYPE)
+struct vec_n_type;            // Primary template is only declared here; the usable mappings are the explicit specialisations
+// DECLARE_NEON_VEC_TYPE(T, N, V): specialises vec_n_type<T, N> so that its nested `type` alias names the vector type V.
+#define DECLARE_NEON_VEC_TYPE(T, N, V) \
+    template <>                        \
+    struct vec_n_type<T, N>            \
+    {                                  \
+        using type = V;                \
+    };
+// Register the (element type, lane count) -> vector type mappings: every element type gets a 16-byte and an 8-byte vector.
+DECLARE_NEON_VEC_TYPE(uint8_t, 16, uint8x16_t)
+DECLARE_NEON_VEC_TYPE(uint8_t, 8, uint8x8_t)
+
+DECLARE_NEON_VEC_TYPE(int8_t, 16, int8x16_t)
+DECLARE_NEON_VEC_TYPE(int8_t, 8, int8x8_t)
+
+DECLARE_NEON_VEC_TYPE(uint16_t, 8, uint16x8_t)
+DECLARE_NEON_VEC_TYPE(uint16_t, 4, uint16x4_t)
+
+DECLARE_NEON_VEC_TYPE(int16_t, 8, int16x8_t)
+DECLARE_NEON_VEC_TYPE(int16_t, 4, int16x4_t)
+
+DECLARE_NEON_VEC_TYPE(int32_t, 4, int32x4_t)
+DECLARE_NEON_VEC_TYPE(int32_t, 2, int32x2_t)
+
+DECLARE_NEON_VEC_TYPE(uint32_t, 4, uint32x4_t)
+DECLARE_NEON_VEC_TYPE(uint32_t, 2, uint32x2_t)
+// The float16_t mappings are only registered when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined.
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+DECLARE_NEON_VEC_TYPE(float16_t, 8, float16x8_t)
+DECLARE_NEON_VEC_TYPE(float16_t, 4, float16x4_t)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+DECLARE_NEON_VEC_TYPE(float, 4, float32x4_t)
+DECLARE_NEON_VEC_TYPE(float, 2, float32x2_t)
+// vec_n_t<T, N>: the vector type registered in vec_n_type for N lanes of element type T.
+template <typename T, int N>
+using vec_n_t = typename vec_n_type<T, N>::type;
+// vec_n_byte_t<T, N>: the vector of T that is N bytes wide, i.e. the one with N / sizeof(T) lanes.
+template <typename T, int N>
+using vec_n_byte_t = vec_n_t < T, N / sizeof(T) >;
+// vec_16_byte_t<T>: the 16-byte vector of T.
+template <typename T>
+using vec_16_byte_t = vec_n_byte_t<T, 16>;
+// vec_8_byte_t<T>: the 8-byte vector of T.
+template <typename T>
+using vec_8_byte_t = vec_n_byte_t<T, 8>;
+// const_ptr_t<T>: pointer to const T.
+template <typename T>
+using const_ptr_t = const T *;
+// ptr_t<T>: pointer to (non-const) T.
+template <typename T>
+using ptr_t = T *;
+// FORWARD_DECLARE_VGET_LANE_FOR_TYPE(TYPE): declares vget_lane<lane>() for the 8-byte and the 16-byte vector of TYPE, each returning TYPE.
+#define FORWARD_DECLARE_VGET_LANE_FOR_TYPE(TYPE) \
+    template <int lane>                          \
+    TYPE vget_lane(vec_8_byte_t<TYPE> vec);      \
+    template <int lane>                          \
+    TYPE vget_lane(vec_16_byte_t<TYPE> vec);
+// Declare vget_lane up front for every supported element type: elem_type_t (defined next) resolves these overloads inside decltype.
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint8_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int8_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint16_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int16_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint32_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int32_t)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(float16_t)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(float)
+template <int lane>
+float vget_lane(float32x4x4_t vec); // Additional overload for float32x4x4_t returning float, so elem_type_t<float32x4x4_t> is float
+// elem_type_t<V>: element type of vector type V, taken from the return type of vget_lane<0>() called with a V (unevaluated).
+template <typename V>
+using elem_type_t = decltype(vget_lane<0>(std::declval<V>()));
+// Returns the lane count of a vector of type V: sizeof the vector divided by sizeof its element type. Only the type of `vec` matters, its value is never read.
+template <typename V>
+constexpr size_t vec_size_of(const V &vec)
+{
+    return sizeof(vec) / sizeof(elem_type_t<V>);
+}
+// Generic entry points selected by the requested vector type V; they are only declared here and specialised by DECLARE_NEON_FUNCTIONS_FOR_TYPE.
+template <typename V>
+V vdup_n(elem_type_t<V> val); // Specialisations forward to the vdup_n_<TAG> / vdupq_n_<TAG> intrinsics
+template <typename V>
+V vld(const_ptr_t<elem_type_t<V>> ptr); // Specialisations forward to the vld1_<TAG> / vld1q_<TAG> intrinsics
+// DECLARE_NEON_FUNCTIONS_FOR_TYPE(TYPE, TAG): defines inline wrappers over the 8-byte and/or 16-byte vectors of TYPE (vdup_n, vld, vst, vmax, vpmax, vget_low, vget_high, vget_lane) that forward to the intrinsics suffixed with TAG; vget_lane additionally static_asserts that the lane index is within the vector.
+#define DECLARE_NEON_FUNCTIONS_FOR_TYPE(TYPE, TAG)                                \
+    template <>                                                                   \
+    inline vec_8_byte_t<TYPE> vdup_n<vec_8_byte_t<TYPE>>(TYPE val)                \
+    {                                                                             \
+        return vdup_n_##TAG(val);                                                 \
+    }                                                                             \
+    template <>                                                                   \
+    inline vec_16_byte_t<TYPE> vdup_n<vec_16_byte_t<TYPE>>(TYPE val)              \
+    {                                                                             \
+        return vdupq_n_##TAG(val);                                                \
+    }                                                                             \
+    template <>                                                                   \
+    inline vec_8_byte_t<TYPE> vld<vec_8_byte_t<TYPE>>(const_ptr_t<TYPE> ptr)      \
+    {                                                                             \
+        return vld1_##TAG(ptr);                                                   \
+    }                                                                             \
+    template <>                                                                   \
+    inline vec_16_byte_t<TYPE> vld<vec_16_byte_t<TYPE>>(const_ptr_t<TYPE> ptr)    \
+    {                                                                             \
+        return vld1q_##TAG(ptr);                                                  \
+    }                                                                             \
+    inline void vst(ptr_t<TYPE> ptr, vec_8_byte_t<TYPE> vec)                      \
+    {                                                                             \
+        vst1_##TAG(ptr, vec);                                                     \
+    }                                                                             \
+    inline void vst(ptr_t<TYPE> ptr, vec_16_byte_t<TYPE> vec)                     \
+    {                                                                             \
+        vst1q_##TAG(ptr, vec);                                                    \
+    }                                                                             \
+    inline vec_16_byte_t<TYPE> vmax(vec_16_byte_t<TYPE> a, vec_16_byte_t<TYPE> b) \
+    {                                                                             \
+        return vmaxq_##TAG(a, b);                                                 \
+    }                                                                             \
+    inline vec_8_byte_t<TYPE> vpmax(vec_8_byte_t<TYPE> a, vec_8_byte_t<TYPE> b)   \
+    {                                                                             \
+        return vpmax_##TAG(a, b);                                                 \
+    }                                                                             \
+    inline vec_8_byte_t<TYPE> vget_low(vec_16_byte_t<TYPE> vec)                   \
+    {                                                                             \
+        return vget_low_##TAG(vec);                                               \
+    }                                                                             \
+    inline vec_8_byte_t<TYPE> vget_high(vec_16_byte_t<TYPE> vec)                  \
+    {                                                                             \
+        return vget_high_##TAG(vec);                                              \
+    }                                                                             \
+    template <int lane>                                                           \
+    inline TYPE vget_lane(vec_8_byte_t<TYPE> vec)                                 \
+    {                                                                             \
+        static_assert(lane >= 0, "lane is out of bounds");                        \
+        static_assert(lane < vec_size_of(vec), "lane is out of bounds");          \
+        return vget_lane_##TAG(vec, lane);                                        \
+    }                                                                             \
+    template <int lane>                                                           \
+    inline TYPE vget_lane(vec_16_byte_t<TYPE> vec)                                \
+    {                                                                             \
+        static_assert(lane >= 0, "lane is out of bounds");                        \
+        static_assert(lane < vec_size_of(vec), "lane is out of bounds");          \
+        return vgetq_lane_##TAG(vec, lane);                                       \
+    }
+// Scalar operations, declared generically; DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT specialises them by forwarding to the matching sqadd_q<TAG> / sqsub_q<TAG> / sqmul_q<TAG> helpers.
+template <typename T>
+T sqadd(T a, T b);
+template <typename T>
+T sqsub(T a, T b);
+template <typename T>
+T sqmul(T a, T b, int fixed_point_position);
+// DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU): defines inline forwarding wrappers; vqsub, vqexp, vmovl, vqrecip, vqmul, sqexp, sqsub and sqmul operate on TYPET (suffix TAGT), while vqadd and sqadd operate on TYPEU (suffix TAGU).
+#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU)                                        \
+    inline vec_8_byte_t<TYPET> vqsub(vec_8_byte_t<TYPET> a, vec_8_byte_t<TYPET> b)                              \
+    {                                                                                                           \
+        return vqsub_##TAGT(a, b);                                                                              \
+    }                                                                                                           \
+    inline vec_8_byte_t<TYPEU> vqadd(vec_8_byte_t<TYPEU> a, vec_8_byte_t<TYPEU> b)                              \
+    {                                                                                                           \
+        return vqadd_##TAGU(a, b);                                                                              \
+    }                                                                                                           \
+    inline vec_16_byte_t<TYPEU> vqadd(vec_16_byte_t<TYPEU> a, vec_16_byte_t<TYPEU> b)                           \
+    {                                                                                                           \
+        return vqaddq_##TAGU(a, b);                                                                             \
+    }                                                                                                           \
+    inline vec_8_byte_t<TYPET> vqexp(vec_8_byte_t<TYPET> vec, int fixed_point_position)                         \
+    {                                                                                                           \
+        return vqexp_q##TAGT(vec, fixed_point_position);                                                        \
+    }                                                                                                           \
+    inline auto vmovl(vec_8_byte_t<TYPET> vec)->decltype(vmovl_##TAGT(vec))                                     \
+    {                                                                                                           \
+        return vmovl_##TAGT(vec);                                                                               \
+    }                                                                                                           \
+    inline vec_16_byte_t<TYPET> vqrecip(vec_16_byte_t<TYPET> vec, int fixed_point_position)                     \
+    {                                                                                                           \
+        return vqrecipq_q##TAGT(vec, fixed_point_position);                                                     \
+    }                                                                                                           \
+    inline vec_16_byte_t<TYPET> vqmul(vec_16_byte_t<TYPET> a, vec_16_byte_t<TYPET> b, int fixed_point_position) \
+    {                                                                                                           \
+        return vqmulq_q##TAGT(a, b, fixed_point_position);                                                      \
+    }                                                                                                           \
+    template <>                                                                                                 \
+    inline TYPEU sqadd<TYPEU>(TYPEU a, TYPEU b)                                                                 \
+    {                                                                                                           \
+        return sqadd_q##TAGU(a, b);                                                                             \
+    }                                                                                                           \
+    inline TYPET sqexp(TYPET val, int fixed_point_position)                                                     \
+    {                                                                                                           \
+        return sqexp_q##TAGT(val, fixed_point_position);                                                        \
+    }                                                                                                           \
+    template <>                                                                                                 \
+    inline TYPET sqsub<TYPET>(TYPET a, TYPET b)                                                                 \
+    {                                                                                                           \
+        return sqsub_q##TAGT(a, b);                                                                             \
+    }                                                                                                           \
+    template <>                                                                                                 \
+    inline TYPET sqmul<TYPET>(TYPET a, TYPET b, int fixed_point_position)                                       \
+    {                                                                                                           \
+        return sqmul_q##TAGT(a, b, fixed_point_position);                                                       \
+    }
+
+#define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG)                               \
+    inline vec_8_byte_t<TYPE> vadd(vec_8_byte_t<TYPE> a, vec_8_byte_t<TYPE> b)    \
+    {                                                                             \
+        return vadd_##TAG(a, b);                                                  \
+    }                                                                             \
+    inline vec_16_byte_t<TYPE> vadd(vec_16_byte_t<TYPE> a, vec_16_byte_t<TYPE> b) \
+    {                                                                             \
+        return vaddq_##TAG(a, b);                                                 \
+    }                                                                             \
+    inline vec_16_byte_t<TYPE> vsub(vec_16_byte_t<TYPE> a, vec_16_byte_t<TYPE> b) \
+    {                                                                             \
+        return vsubq_##TAG(a, b);                                                 \
+    }                                                                             \
+    inline vec_16_byte_t<TYPE> vexp(vec_16_byte_t<TYPE> vec)                      \
+    {                                                                             \
+        return vexpq_##TAG(vec);                                                  \
+    }                                                                             \
+    inline vec_16_byte_t<TYPE> vmul_n(vec_16_byte_t<TYPE> vec, TYPE val)          \
+    {                                                                             \
+        return vmulq_n_##TAG(vec, val);                                           \
+    }
+
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint8_t, u8)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(int8_t, s8)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint16_t, u16)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(int16_t, s16)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint32_t, u32)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(int32_t, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32)
+
+DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16)
+DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32)
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float, f32)
+
+template <typename VO, typename VI>
+VO vcvt(VI vec);
+
+template <>
+float32x4x4_t vcvt<float32x4x4_t>(uint8x16_t vec)
+{
+    const auto    low  = vmovl_u8(vget_low(vec));
+    const auto    high = vmovl_u8(vget_high(vec));
+    float32x4x4_t res  = { {
+            vcvtq_f32_u32(vmovl_u16(vget_low(low))),
+            vcvtq_f32_u32(vmovl_u16(vget_high(low))),
+            vcvtq_f32_u32(vmovl_u16(vget_low(high))),
+            vcvtq_f32_u32(vmovl_u16(vget_high(high)))
+        }
+    };
+    return res;
+}
+
+template <>
+uint8x16_t vcvt<uint8x16_t>(float32x4x4_t vec)
+{
+    uint16x8x2_t resU16 = { {
+            vcombine_u16(vqmovn_u32(vcvtq_u32_f32(vec.val[0])),
+            vqmovn_u32(vcvtq_u32_f32(vec.val[1]))),
+            vcombine_u16(vqmovn_u32(vcvtq_u32_f32(vec.val[2])),
+            vqmovn_u32(vcvtq_u32_f32(vec.val[3])))
+        }
+    };
+
+    uint8x16_t res = vcombine_u8(vqmovn_u16(resU16.val[0]), vqmovn_u16(resU16.val[1]));
+    return res;
+}
+
+float32x4x4_t vexp(float32x4x4_t vec)
+{
+    float32x4x4_t res = { {
+            vexpq_f32(vec.val[0]),
+            vexpq_f32(vec.val[1]),
+            vexpq_f32(vec.val[2]),
+            vexpq_f32(vec.val[3])
+        }
+    };
+    return res;
+}
+
+template <>
+float32x4x4_t vdup_n<float32x4x4_t>(float val)
+{
+    float32x4x4_t res = { {
+            vdupq_n_f32(val),
+            vdupq_n_f32(val),
+            vdupq_n_f32(val),
+            vdupq_n_f32(val)
+        }
+    };
+    return res;
+}
+
+float32x4x4_t vmul_n(float32x4x4_t vec, float val)
+{
+    float32x4x4_t res = { {
+            vmulq_n_f32(vec.val[0], val),
+            vmulq_n_f32(vec.val[1], val),
+            vmulq_n_f32(vec.val[2], val),
+            vmulq_n_f32(vec.val[3], val)
+        }
+    };
+    return res;
+}
+
+float32x4x4_t vadd(float32x4x4_t a, float32x4x4_t b)
+{
+    float32x4x4_t res = { {
+            vaddq_f32(a.val[0], b.val[0]),
+            vaddq_f32(a.val[1], b.val[1]),
+            vaddq_f32(a.val[2], b.val[2]),
+            vaddq_f32(a.val[3], b.val[3])
+        }
+    };
+    return res;
+}
 
 namespace
 {
-Status validate_arguments_logits_1d_max(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        // Softmax across the x dimension
-        TensorShape output_shape{ input->tensor_shape() };
-        output_shape.set(0, 1);
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInfo *input, ITensorInfo *output)
-{
-    // Configure kernel window
-    constexpr unsigned int num_elems_written_per_row = 1;
-    const int              input_width               = input->valid_region().shape.x();
-
-    unsigned int           num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
-    Window                 win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    bool                   window_changed = false;
-
-    if(output->total_size() != 0)
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_written_per_row, 1.f / input_width);
-        window_changed = update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input_access);
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-Status validate_arguments_logits_1d_shift_exp_sum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, sum, output);
-    ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input->data_type()));
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    }
-
-    // Checks performed when sum is configured
-    if(sum->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, sum);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max, sum);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_logits_1d_shift_exp_sum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
-{
-    unsigned int num_elems_processed_per_iteration = input->valid_region().shape.x();
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal max_access(max, 0, 1);
-    AccessWindowHorizontal sum_access(sum, 0, 1);
-    bool                   window_changed = false;
-
-    if(output->total_size() != 0)
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input_access, max_access, sum_access);
-    }
-
-    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-Status validate_arguments_logits_1d_norm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);
-
-    // Checks performed when output is configured
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window_logits_1d_norm(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output)
-{
-    // Configure kernel window
-    unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
-    Window       win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     sum_access(sum, 0, 0, 1, sum->dimension(1));
-    bool                   window_changed = false;
-
-    if(output->total_size() != 0)
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-        window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
-
-        output_access.set_valid_region(win, input->valid_region());
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input_access, sum_access);
-    }
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
-{
-    Window in_slice = window.first_slice_window_1D();
-
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window max_slice = window_max.first_slice_window_1D();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator output(out, max_slice);
-
-        qint8x16_t vec_max = vdupq_n_s8(std::numeric_limits<qint8_t>::lowest());
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto       in_ptr        = reinterpret_cast<const qint8_t *>(input.ptr());
-            const qint8x16_t current_value = vld1q_qs8(in_ptr);
-            vec_max                        = vmaxq_qs8(vec_max, current_value);
-        },
-        input);
-
-        qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
-        carry_max           = vpmax_qs8(carry_max, carry_max);
-        carry_max           = vpmax_qs8(carry_max, carry_max);
-        carry_max           = vpmax_qs8(carry_max, carry_max);
-
-        *(reinterpret_cast<qint8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
-void logits_1d_max_qs16(const ITensor *in, ITensor *out, const Window &window)
-{
-    Window in_slice = window.first_slice_window_1D();
-
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window max_slice = window_max.first_slice_window_1D();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator output(out, max_slice);
-
-        qint16x8_t vec_max = vdupq_n_qs16(std::numeric_limits<qint16_t>::lowest());
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto       in_ptr        = reinterpret_cast<const qint16_t *>(input.ptr());
-            const qint16x8_t current_value = vld1q_qs16(in_ptr);
-            vec_max                        = vmaxq_qs16(vec_max, current_value);
-        },
-        input);
-
-        qint16x4_t carry_max = vpmax_qs16(vget_high_qs16(vec_max), vget_low_qs16(vec_max));
-        carry_max            = vpmax_qs16(carry_max, carry_max);
-        carry_max            = vpmax_qs16(carry_max, carry_max);
-
-        *(reinterpret_cast<qint16_t *>(output.ptr())) = vget_lane_s16(carry_max, 0);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void logits_1d_max_f16(const ITensor *in, ITensor *out, const Window &window)
-{
-    Window in_slice = window.first_slice_window_1D();
-
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window max_slice = window_max.first_slice_window_1D();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator output(out, max_slice);
-
-        float16x8_t vec_max = vdupq_n_f16(std::numeric_limits<float16_t>::lowest());
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto        in_ptr        = reinterpret_cast<const float16_t *>(input.ptr());
-            const float16x8_t current_value = vld1q_f16(in_ptr);
-            vec_max                         = vmaxq_f16(vec_max, current_value);
-        },
-        input);
-
-        float16x4_t carry_max = vpmax_f16(vget_high_f16(vec_max), vget_low_f16(vec_max));
-        carry_max             = vpmax_f16(carry_max, carry_max);
-        carry_max             = vpmax_f16(carry_max, carry_max);
-
-        *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(carry_max, 0);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
-{
-    Window in_slice = window.first_slice_window_1D();
-
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window max_slice = window_max.first_slice_window_1D();
-
-    do
+    // Validate in case of configured output
+    if(output.total_size() != 0)
     {
-        Iterator input(in, in_slice);
-        Iterator output(out, max_slice);
-
-        float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto        in_ptr        = reinterpret_cast<const float *>(input.ptr());
-            const float32x4_t current_value = vld1q_f32(in_ptr);
-            vec_max                         = vmaxq_f32(vec_max, current_value);
-        },
-        input);
-
-        float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
-        carry_max             = vpmax_f32(carry_max, carry_max);
-
-        *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_max, 0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
     }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInfo &input, ITensorInfo &output)
+{
+    // Softmax across the x dimension
+    const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info());
+
+    // Configure kernel window
+    const int input_width                       = input.valid_region().shape.x();
+    const int num_elems_processed_per_iteration = 16U / data_size_from_type(input.data_type());
+    const int num_elems_read_per_iteration      = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
+
+    const ValidRegion out_valid_region(ValidRegion(input.valid_region()).set(0, 0, 1));
+    output.set_valid_region(out_valid_region);
+
+    Window win = calculate_max_window(output);
+
+    AccessWindowHorizontal input_access(&input, input.valid_region().anchor.x(), num_elems_read_per_iteration);
+    AccessWindowHorizontal output_access(&output, 0, 1);
+
+    const bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    const Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+template <typename V>
+auto reduce_max(V vec) -> elem_type_t<V>
+{
+    constexpr int N = vec_size_of(vec);
+
+    auto carry_max = vpmax(vget_high(vec), vget_low(vec));
+
+    for(int k = N / 2; k > 1; k /= 2)
+    {
+        carry_max = vpmax(carry_max, carry_max);
+    }
+
+    return vget_lane<0>(carry_max);
+}
+
+template <typename T>
+void logits_1d_max(const ITensor &in, ITensor &out, const Window &window)
+{
+    const auto   start_x     = in.info()->valid_region().anchor.x();
+    const size_t input_width = in.info()->valid_region().shape.x();
+
+    Iterator input(&in, window);
+    Iterator output(&out, window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        // Get pointers
+        const auto in_ptr  = reinterpret_cast<const T *>(input.ptr()) + start_x;
+        const auto out_ptr = reinterpret_cast<T *>(output.ptr());
+
+        // Init max value
+        auto vec_max = vdup_n<vec_16_byte_t<T>>(std::numeric_limits<T>::lowest());
+
+        // Loop over input row
+        for(const T *it = in_ptr; it < (in_ptr + input_width); it += vec_size_of(vec_max))
+        {
+            const auto current_value = vld<vec_16_byte_t<T>>(it);
+            vec_max                  = vmax(vec_max, current_value);
+        }
+
+        const T max_val = reduce_max(vec_max);
+        *out_ptr        = max_val;
+    },
+    input, output);
 }
 } // namespace
 
@@ -328,54 +476,54 @@
 void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Softmax across the x dimension
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(0, 1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(input->info(), output->info()));
-
-    const int    input_width                       = input->info()->valid_region().shape.x();
-    unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*input->info(), *output->info()));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_logits_1d_max(*input->info(), *output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     switch(input->info()->data_type())
     {
+        case DataType::QASYMM8:
+            _func = &logits_1d_max<qasymm8_t>;
+            break;
         case DataType::QS8:
-            _func = &logits_1d_max_qs8;
+            _func = &logits_1d_max<qint8_t>;
             break;
         case DataType::QS16:
-            _func = &logits_1d_max_qs16;
+            _func = &logits_1d_max<qint16_t>;
             break;
-        case DataType::F32:
-            _func = &logits_1d_max_f32;
-            break;
-        case DataType::F16:
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            _func = &logits_1d_max_f16;
+        case DataType::F16:
+            _func = &logits_1d_max<float16_t>;
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        case DataType::F32:
+            _func = &logits_1d_max<float>;
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
     }
 
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0);
+    _input  = input;
+    _output = output;
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_logits_1d_max(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    const int input_width                       = input->info()->valid_region().shape.x();
+    const int num_elems_processed_per_iteration = 16U / data_size_from_type(input->info()->data_type());
+    const int num_elems_read_per_iteration      = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
+    // Border spans the elements between the valid row width and the vector-rounded read width
+    _border_size = BorderSize(0, num_elems_read_per_iteration - input_width, 0, 0);
+
     INEKernel::configure(win_config.second);
 }
 
 Status NELogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(input->clone().get(), output->clone().get()).first);
+    // Null arguments are reported through the returned Status, like every other validation failure
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*input, *output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(*input->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -387,297 +535,393 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _output, window);
+    (*_func)(*_input, *_output, window);
 }
 
 namespace
 {
-void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
+Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max,
+                                         const ITensorInfo &output, const float beta, const ITensorInfo &tmp)
 {
-    ARM_COMPUTE_UNUSED(beta);
-
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    Window max_slice = window_max.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    constexpr int step                 = 8;
-    const int     long_steps           = in->info()->valid_region().shape.x() / step;
-    const int     small_steps          = in->info()->valid_region().shape.x() % step;
-    const int     fixed_point_position = in->info()->fixed_point_position();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator exp(out, in_slice);
-        Iterator _max(max, max_slice);
-        Iterator _sum(sum, max_slice);
-
-        // Get pointers
-        auto in_ptr  = reinterpret_cast<const qint8_t *>(input.ptr());
-        auto exp_ptr = reinterpret_cast<qint8_t *>(exp.ptr());
-
-        // Init sum to zero
-        qint16x8_t vec_sum_value = vdupq_n_qs16(0);
-
-        // Get max value
-        const auto      max_ptr = reinterpret_cast<const qint8_t *>(_max.ptr());
-        const qint8x8_t vec_max = vdup_n_qs8(*max_ptr);
-
-        // Run neon loop
-        for(int i = 0; i < long_steps; ++i)
-        {
-            qint8x8_t vec_elements = vld1_qs8(in_ptr);
-            vec_elements           = vqsub_qs8(vec_elements, vec_max);
-            vec_elements           = vqexp_qs8(vec_elements, fixed_point_position);
-
-            vst1_qs8(exp_ptr, vec_elements);
-            vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements));
-
-            in_ptr += step;
-            exp_ptr += step;
-        }
-        // Reduce sum
-        const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value));
-        const qint16_t   sum0    = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1));
-        const qint16_t   sum1    = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3));
-        qint16_t         sum     = sqadd_qs16(sum0, sum1);
-
-        // Run remaining elements
-        for(int i = 0; i < small_steps; ++i)
-        {
-            qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position);
-            exp_ptr[i]      = element;
-            sum             = sqadd_qs16(sum, element);
-        }
-
-        *(reinterpret_cast<qint8_t *>(_sum.ptr())) = sqmovn_qs16(sum);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
-void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
-{
-    ARM_COMPUTE_UNUSED(beta);
-
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    Window max_slice = window_max.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    constexpr int step                 = 4;
-    const int     long_steps           = in->info()->valid_region().shape.x() / step;
-    const int     small_steps          = in->info()->valid_region().shape.x() % step;
-    const int     fixed_point_position = in->info()->fixed_point_position();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator exp(out, in_slice);
-        Iterator _max(max, max_slice);
-        Iterator _sum(sum, max_slice);
-
-        // Get pointers
-        auto in_ptr  = reinterpret_cast<const qint16_t *>(input.ptr());
-        auto exp_ptr = reinterpret_cast<qint16_t *>(exp.ptr());
-
-        // Init sum to zero
-        qint32x4_t vec_sum_value = vdupq_n_qs32(0);
-
-        // Get max value
-        const auto       max_ptr = reinterpret_cast<const qint16_t *>(_max.ptr());
-        const qint16x4_t vec_max = vdup_n_qs16(*max_ptr);
-
-        // Run neon loop
-        for(int i = 0; i < long_steps; ++i)
-        {
-            qint16x4_t vec_elements = vld1_qs16(in_ptr);
-            vec_elements            = vqsub_qs16(vec_elements, vec_max);
-            vec_elements            = vqexp_qs16(vec_elements, fixed_point_position);
-
-            vst1_qs16(exp_ptr, vec_elements);
-            vec_sum_value = vqaddq_qs32(vec_sum_value, vmovl_s16(vec_elements));
-
-            in_ptr += step;
-            exp_ptr += step;
-        }
-        // Reduce sum
-        qint32x2_t carry_addition = vqadd_qs32(vget_high_s32(vec_sum_value), vget_low_s32(vec_sum_value));
-        qint32_t   sum            = vget_lane_s32(carry_addition, 0) + vget_lane_s32(carry_addition, 1);
-
-        // Run remaining elements
-        for(int i = 0; i < small_steps; ++i)
-        {
-            qint16_t element = sqexp_qs16(sqsub_qs16(in_ptr[i], *max_ptr), fixed_point_position);
-            exp_ptr[i]       = element;
-            sum              = sqadd_qs32(sum, element);
-        }
-
-        *(reinterpret_cast<qint16_t *>(_sum.ptr())) = sqmovn_qs32(sum);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
-
+    // Check input
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
-{
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    Window max_slice = window_max.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    constexpr int step        = 8;
-    const int     long_steps  = in->info()->valid_region().shape.x() / step;
-    const int     small_steps = in->info()->valid_region().shape.x() % step;
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator exp(out, in_slice);
-        Iterator _max(max, max_slice);
-        Iterator _sum(sum, max_slice);
-
-        // Get pointers
-        auto in_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-        auto exp_ptr = reinterpret_cast<float16_t *>(exp.ptr());
-
-        // Init sum to zero
-        float16x8_t vec_sum_value = vdupq_n_f16(0);
-
-        // Get max value
-        const auto        max_ptr = reinterpret_cast<const float16_t *>(_max.ptr());
-        const float16x8_t vec_max = vdupq_n_f16(*max_ptr);
-
-        // Run neon loop
-        for(int i = 0; i < long_steps; ++i)
-        {
-            float16x8_t vec_elements = vld1q_f16(in_ptr);
-            vec_elements             = vsubq_f16(vec_elements, vec_max);
-            vec_elements             = vmulq_n_f16(vec_elements, beta);
-            vec_elements             = vexpq_f16(vec_elements);
-
-            vst1q_f16(exp_ptr, vec_elements);
-            vec_sum_value = vaddq_f16(vec_sum_value, vec_elements);
-
-            in_ptr += step;
-            exp_ptr += step;
-        }
-        // Reduce sum
-        const float16x4_t sum_red        = vadd_f16(vget_low_f16(vec_sum_value), vget_high_f16(vec_sum_value));
-        const float16x4_t carry_addition = vpadd_f16(sum_red, sum_red);
-        float16_t         sum            = vget_lane_f16(carry_addition, 0) + vget_lane_f16(carry_addition, 1);
-
-        // Run remaining elements
-        for(int i = 0; i < small_steps; ++i)
-        {
-            const float16_t element = std::exp(static_cast<float>(in_ptr[i] - *max_ptr) * beta);
-            exp_ptr[i]              = element;
-            sum += element;
-        }
-        *(reinterpret_cast<float16_t *>(_sum.ptr())) = sum;
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
-{
-    Window window_max(window);
-    window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
 
-    Window max_slice = window_max.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
+    // Check max
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max);
 
-    constexpr int step        = 4;
-    const int     long_steps  = in->info()->valid_region().shape.x() / step;
-    const int     small_steps = in->info()->valid_region().shape.x() % step;
-
-    do
+    // Check output if configured
+    if(output.total_size() != 0)
     {
-        Iterator input(in, in_slice);
-        Iterator exp(out, in_slice);
-        Iterator _max(max, max_slice);
-        Iterator _sum(sum, max_slice);
-
-        // Get pointers
-        auto in_ptr  = reinterpret_cast<const float *>(input.ptr());
-        auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
-
-        // Init sum to zero
-        float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
-
-        // Get max value
-        const auto        max_ptr = reinterpret_cast<const float *>(_max.ptr());
-        const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
-
-        // Run neon loop
-        for(int i = 0; i < long_steps; ++i)
-        {
-            float32x4_t vec_elements = vld1q_f32(in_ptr);
-            vec_elements             = vsubq_f32(vec_elements, vec_max);
-            vec_elements             = vmulq_n_f32(vec_elements, beta);
-            vec_elements             = vexpq_f32(vec_elements);
-
-            vst1q_f32(exp_ptr, vec_elements);
-            vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
-
-            in_ptr += step;
-            exp_ptr += step;
-        }
-
-        // Reduce sum
-        float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
-        carry_addition             = vpadd_f32(carry_addition, carry_addition);
-        float sum                  = vget_lane_f32(carry_addition, 0);
-
-        // Run remaining elements
-        for(int i = 0; i < small_steps; ++i)
-        {
-            float element = std::exp((in_ptr[i] - *max_ptr) * beta);
-            exp_ptr[i]    = element;
-            sum += element;
-        }
-
-        *(reinterpret_cast<float *>(_sum.ptr())) = sum;
+        const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization);
     }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
-} //namespace
 
-NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
-    : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr), _beta(1.0f)
-{
+    // Check beta
+    ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type()));
+
+    // Check tmp if configured
+    if(tmp.total_size() != 0)
+    {
+        const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
+        ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp);
+        // We could potentially reduce tmp memory if we could predict or make an assumption
+        // on the maximum number of threads that will run in parallel.
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp);
+    }
+
+    return Status{};
 }
 
-void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta)
+/** Auto-initialise the output and scratch tensor infos and compute the execution window of the softmax kernel.
+ *
+ * @param[in,out] input  Info of the logits tensor; its valid region defines the row span that is accessed.
+ * @param[in,out] max    Info of the tensor holding the per-row maximum; the execution window is computed from it.
+ * @param[in,out] output Info of the destination tensor; initialised from @p input if still empty.
+ * @param[in,out] tmp    Info of the scratch tensor; initialised from @p input if still empty.
+ *
+ * @return A pair made of the status (a runtime error if the window had to be changed, i.e. the padding
+ *         is insufficient) and the computed window.
+ */
+std::pair<Status, Window> validate_and_configure_window_logits_softmax(ITensorInfo &input, ITensorInfo &max,
+                                                                       ITensorInfo &output, ITensorInfo &tmp)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
+    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    // (asymmetric quantized outputs always use a scale of 1/256 and a zero offset)
+    const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
+    auto_init_if_empty(output, TensorInfo(input).set_quantization_info(output_quantization).reset_padding());
 
+    // Tmp auto initialization if not yet initialized
+    // (asymmetric quantized inputs keep their exponentials as F32 in the scratch tensor)
+    const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
+    auto_init_if_empty(tmp, TensorInfo(input).set_data_type(tmp_data_type).reset_padding());
+
+    const int input_width = input.valid_region().shape.x();
+
+    Window win = calculate_max_window(max);
+
+    AccessWindowHorizontal input_access(&input, input.valid_region().anchor.x(), input_width);
+    // A single element per row is read from max, so the access has to be declared on max (not on input)
+    AccessWindowHorizontal max_access(&max, 0, 1);
+    AccessWindowHorizontal output_access(&output, input.valid_region().anchor.x(), input_width);
+    AccessWindowHorizontal tmp_access(&tmp, input.valid_region().anchor.x(), input_width);
+
+    const bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, tmp_access);
+
+    output.set_valid_region(input.valid_region());
+
+    const Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+/** Recursive reduction of the lanes [S, E] of an N-lane vector of T with a binary functor.
+ *
+ * The lane range is split in two parts, each part is reduced on its own and the two
+ * partial results are then combined with a single call to the functor.
+ */
+template <typename T, int N, int S, int E>
+struct reduce_add_impl
+{
+    template <typename F>
+    static T reduce(F add_fn, vec_n_t<T, N> vec)
+    {
+        constexpr int split = (S + E + 1) / 2; // first lane of the upper part
+        using lower_part    = reduce_add_impl<T, N, S, split - 1>;
+        using upper_part    = reduce_add_impl<T, N, split, E>;
+
+        const auto lower_result = lower_part::reduce(add_fn, vec);
+        const auto upper_result = upper_part::reduce(add_fn, vec);
+        return add_fn(lower_result, upper_result);
+    }
+};
+/** End of the recursion: a range made of the single lane I reduces to the value held by that lane. */
+template <typename T, int N, int I>
+struct reduce_add_impl<T, N, I, I>
+{
+    template <typename F>
+    static T reduce(F /*add_fn*/, vec_n_t<T, N> vec)
+    {
+        const T lane_value = vget_lane<I>(vec);
+        return lane_value;
+    }
+};
+/** Reduce every lane of @p vec to a single scalar by combining the lanes with @p add_fn.
+ *
+ * @param[in] add_fn Binary functor used to combine two partial results.
+ * @param[in] vec    Vector to reduce; lanes 0 to (number of lanes - 1) are all taken into account.
+ *
+ * @return The reduced value, of the element type of @p vec.
+ */
+template <typename V, typename F>
+elem_type_t<V> reduce_add(F add_fn, V vec)
+{
+    using elem_t             = elem_type_t<V>;
+    constexpr int lane_count = vec_size_of(vec);
+    using reducer            = reduce_add_impl<elem_t, lane_count, 0, lane_count - 1>;
+    return reducer::reduce(add_fn, vec);
+}
+
+/** Compute the softmax of every row of a QASYMM8 tensor.
+ *
+ * For each row the exponentials exp((max - x) * scale_beta) are evaluated in float, stored in @p tmp
+ * and summed; each stored exponential is then multiplied by (256 / sum) and converted to qasymm8_t.
+ *
+ * @param[in]  in     Source tensor; the start and width of a row are taken from its valid region.
+ * @param[in]  max    Tensor the maximum of each row is read from.
+ * @param[in]  tmp    Scratch buffer, accessed as an array of float indexed from 0 (no start_x offset).
+ * @param[out] out    Destination tensor.
+ * @param[in]  beta   Scaling factor of the exponent.
+ * @param[in]  window Region the rows are iterated over.
+ */
+void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window)
+{
+    const int start_x     = in.info()->valid_region().anchor.x();
+    const int input_width = in.info()->valid_region().shape.x();
+
+    // Negative on purpose: the loops below compute (max - x), hence
+    // exp((max - x) * scale_beta) == exp(beta * scale * (x - max))
+    const float scale_beta = -beta * in.info()->quantization_info().scale;
+
+    Iterator in_it(&in, window);
+    Iterator max_it(&max, window);
+    Iterator out_it(&out, window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        /* Get pointers */
+        const auto in_ptr  = reinterpret_cast<const qasymm8_t *>(in_it.ptr()) + start_x;
+        const auto out_ptr = reinterpret_cast<qasymm8_t *>(out_it.ptr()) + start_x;
+        const auto tmp_ptr = reinterpret_cast<float *>(tmp);
+
+        /* Set by the first stage below, consumed by the normalization stage */
+        float sum_inversed;
+
+        /* Compute exponentials and sum */
+        {
+            /* Get max value */
+            const auto max_val = *reinterpret_cast<const qasymm8_t *>(max_it.ptr());
+            const auto vec_max = vdup_n<vec_16_byte_t<qasymm8_t>>(max_val);
+
+            /* Init sum to zero */
+            auto vec_sum = vdup_n<float32x4x4_t>(0.f);
+
+            /* Loop over row and compute exponentials and sum */
+            int           i        = 0;
+            constexpr int vec_size = vec_size_of(vec_max);
+            for(; i <= (input_width - vec_size); i += vec_size)
+            {
+                auto vec_elements = vld<vec_16_byte_t<qasymm8_t>>(in_ptr + i);
+                /* max - x, computed on unsigned 8 bit lanes */
+                vec_elements      = vsubq_u8(vec_max, vec_elements);
+
+                auto vec_elements_flt = vcvt<float32x4x4_t>(vec_elements);
+                vec_elements_flt      = vexp(vmul_n(vec_elements_flt, scale_beta));
+
+                vec_sum = vadd(vec_sum, vec_elements_flt);
+
+                /* vst4q_f32 interleaves the four registers in memory; the normalization loop reads
+                 * them back with the matching vld4q_f32 */
+                vst4q_f32(tmp_ptr + i, vec_elements_flt);
+            }
+            /* Reduce sum */
+            const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]),
+                                               vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+            const auto sum_8_byte = vadd_f32(vget_low(sum_16_byte), vget_high(sum_16_byte));
+            float      sum        = reduce_add(std::plus<float>(), sum_8_byte);
+
+            /* Run remaining elements */
+            for(; i < input_width; ++i)
+            {
+                const float element = std::exp((max_val - in_ptr[i]) * scale_beta);
+                sum += element;
+                tmp_ptr[i] = element;
+            }
+
+            /* The factor 256 is folded into the reciprocal of the sum, so that a single
+             * multiplication both normalizes and rescales the exponentials */
+            sum_inversed = 256.f / sum;
+        }
+
+        /* Normalize exponentials */
+        {
+            /* Loop over row and compute softmax */
+            int i = 0;
+            {
+                constexpr int vec_size = 16;
+                for(; i <= (input_width - vec_size); i += vec_size)
+                {
+                    float32x4x4_t vec_in           = vld4q_f32(tmp_ptr + i);
+                    auto          normalized_value = vcvt<vec_16_byte_t<qasymm8_t>>(vmul_n(vec_in, sum_inversed));
+                    vst(out_ptr + i, normalized_value);
+                }
+            }
+            /* Run remaining elements */
+            for(; i < input_width; ++i)
+            {
+                out_ptr[i] = utility::saturate_cast<qasymm8_t>(tmp_ptr[i] * sum_inversed);
+            }
+        }
+    },
+    in_it, max_it, out_it);
+}
+
+/** Compute the softmax of every row of a fixed point tensor.
+ *
+ * For each row the saturating exponentials of (x - max) are stored in @p tmp and accumulated in the
+ * accumulator type U; the sum is saturated back to T, inverted, and multiplied with every stored
+ * exponential to produce the output.
+ *
+ * @tparam T Element type of the input, output and scratch buffers.
+ * @tparam U Type the sum of the exponentials is accumulated in.
+ *
+ * @param[in]  in     Source tensor; the start and width of a row and the fixed point position are read from it.
+ * @param[in]  max    Tensor the maximum of each row is read from.
+ * @param[in]  tmp    Scratch buffer, accessed as an array of T indexed from 0 (no start_x offset).
+ * @param[out] out    Destination tensor.
+ * @param[in]  beta   Unused by this implementation.
+ * @param[in]  window Region the rows are iterated over.
+ */
+template <typename T, typename U>
+void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp,
+                                   ITensor &out, const float /*beta*/, const Window &window)
+{
+    const int start_x     = in.info()->valid_region().anchor.x();
+    const int input_width = in.info()->valid_region().shape.x();
+
+    const int fixed_point_position = in.info()->fixed_point_position();
+
+    Iterator in_it(&in, window);
+    Iterator max_it(&max, window);
+    Iterator out_it(&out, window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        /* Get pointers */
+        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+        const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+
+        /* Set by the first stage below, consumed by the normalization stage */
+        vec_16_byte_t<T> vec_sum_inversed;
+
+        /* Compute exponentials and sum */
+        {
+            /* Get max value */
+            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+            const auto vec_max = vdup_n<vec_8_byte_t<T>>(max_val);
+
+            /* Init sum to zero */
+            auto vec_sum = vdup_n<vec_16_byte_t<U>>(0);
+
+            /* Loop over row and compute exponentials and sum */
+            /* The step is the lane count of the 16 byte accumulator of U while the data is loaded as
+             * 8 byte vectors of T - NOTE(review): presumably the two lane counts match because U is
+             * twice as wide as T; confirm for every instantiation */
+            int           i        = 0;
+            constexpr int vec_size = vec_size_of(vec_sum);
+            for(; i <= (input_width - vec_size); i += vec_size)
+            {
+                auto vec_elements = vld<vec_8_byte_t<T>>(in_ptr + i);
+                vec_elements      = vqsub(vec_elements, vec_max);
+                vec_elements      = vqexp(vec_elements, fixed_point_position);
+                /* Widen to U before accumulating */
+                vec_sum           = vqadd(vec_sum, vmovl(vec_elements));
+                vst(tmp_ptr + i, vec_elements);
+            }
+            /* Reduce sum */
+            const vec_8_byte_t<U> sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum));
+            U                     sum        = reduce_add(sqadd<U>, sum_8_byte);
+
+            /* Run remaining elements */
+            for(; i < input_width; ++i)
+            {
+                T element  = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position);
+                sum        = sqadd<U>(sum, element);
+                tmp_ptr[i] = element;
+            }
+
+            /* The sum is saturated back to T before its reciprocal is taken */
+            const auto qsum  = utility::saturate_cast<T>(sum);
+            vec_sum_inversed = vqrecip(vdup_n<vec_16_byte_t<T>>(qsum), fixed_point_position);
+        }
+
+        /* Normalize exponentials */
+        {
+            /* Loop over row and compute softmax */
+            int           i        = 0;
+            constexpr int vec_size = vec_size_of(vec_sum_inversed);
+            for(; i <= (input_width - vec_size); i += vec_size)
+            {
+                const auto             vec_in           = vld<vec_16_byte_t<T>>(tmp_ptr + i);
+                const vec_16_byte_t<T> normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position);
+                vst(out_ptr + i, normalized_value);
+            }
+
+            /* Every lane holds the same reciprocal, lane 0 is reused for the scalar tail */
+            const T sum_inversed = vget_lane<0>(vec_sum_inversed);
+
+            /* Run remaining elements */
+            for(; i < input_width; ++i)
+            {
+                out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position);
+            }
+        }
+    },
+    in_it, max_it, out_it);
+}
+
+/** Compute the softmax of every row of a floating point tensor.
+ *
+ * For each row the exponentials exp((x - max) * beta) are stored in @p tmp and summed; every stored
+ * exponential is then multiplied by the reciprocal of the sum and written to the output.
+ *
+ * @tparam T Element type of the input, output and scratch buffers.
+ *
+ * @param[in]  in     Source tensor; the start and width of a row are taken from its valid region.
+ * @param[in]  max    Tensor the maximum of each row is read from.
+ * @param[in]  tmp    Scratch buffer, accessed as an array of T indexed from 0 (no start_x offset).
+ * @param[out] out    Destination tensor.
+ * @param[in]  beta   Scaling factor of the exponent.
+ * @param[in]  window Region the rows are iterated over.
+ */
+template <typename T>
+void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp,
+                             ITensor &out, const float beta, const Window &window)
+{
+    const int start_x     = in.info()->valid_region().anchor.x();
+    const int input_width = in.info()->valid_region().shape.x();
+
+    Iterator in_it(&in, window);
+    Iterator max_it(&max, window);
+    Iterator out_it(&out, window);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        /* Get pointers */
+        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+        const auto tmp_ptr = reinterpret_cast<T *>(tmp);
+
+        /* Set by the first stage below, consumed by the normalization stage */
+        T sum_inversed;
+
+        /* Compute exponentials and sum */
+        {
+            /* Get max value */
+            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+            const auto vec_max = vdup_n<vec_16_byte_t<T>>(max_val);
+
+            /* Init sum to zero */
+            auto vec_sum = vdup_n<vec_16_byte_t<T>>(0);
+
+            /* Loop over row and compute exponentials and sum */
+            int           i        = 0;
+            constexpr int vec_size = vec_size_of(vec_sum);
+            for(; i <= (input_width - vec_size); i += vec_size)
+            {
+                auto vec_elements = vld<vec_16_byte_t<T>>(in_ptr + i);
+                vec_elements      = vsub(vec_elements, vec_max);
+                vec_elements      = vexp(vmul_n(vec_elements, beta));
+                vec_sum           = vadd(vec_sum, vec_elements);
+                vst(tmp_ptr + i, vec_elements);
+            }
+            /* Reduce sum */
+            const auto sum_8_byte = vadd(vget_high(vec_sum), vget_low(vec_sum));
+            T sum                 = reduce_add([](T a, T b) -> T { return a + b; }, sum_8_byte);
+
+            /* Run remaining elements */
+            for(; i < input_width; ++i)
+            {
+                T element = std::exp((in_ptr[i] - max_val) * beta);
+                sum += element;
+                tmp_ptr[i] = element;
+            }
+
+            /* One division per row; the normalization below only multiplies */
+            sum_inversed = T(1) / sum;
+        }
+
+        /* Normalize exponentials */
+        {
+            /* Loop over row and compute softmax */
+            int i = 0;
+            {
+                constexpr int vec_size = vec_size_of(vec_16_byte_t<T> {});
+                for(; i <= (input_width - vec_size); i += vec_size)
+                {
+                    auto             vec_in           = vld<vec_16_byte_t<T>>(tmp_ptr + i);
+                    vec_16_byte_t<T> normalized_value = vmul_n(vec_in, sum_inversed);
+                    vst(out_ptr + i, normalized_value);
+                }
+            }
+            /* Run remaining elements */
+            for(; i < input_width; ++i)
+            {
+                out_ptr[i] = tmp_ptr[i] * sum_inversed;
+            }
+        }
+    },
+    in_it, max_it, out_it);
+}
+} // namespace
+
+/** Default constructor: every pointer member starts as nullptr and beta as 1.0f. */
+NELogits1DSoftmaxKernel::NELogits1DSoftmaxKernel()
+    : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr)
+{
+}
+
+void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), max->info(), output->info(), tmp->info());
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info(), beta));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*input->info(), *max->info(), *output->info(), beta, *tmp->info()));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_logits_softmax(*input->info(), *max->info(), *output->info(), *tmp->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     switch(input->info()->data_type())
     {
+        case DataType::QASYMM8:
+            _func = &logits_1d_softmax_qasymm8;
+            break;
         case DataType::QS8:
-            _func = &logits_1d_shift_exp_sum_qs8;
+            _func = &logits_1d_softmax_fixed_point<qint8_t, qint16_t>;
             break;
         case DataType::QS16:
-            _func = &logits_1d_shift_exp_sum_qs16;
+            _func = &logits_1d_softmax_fixed_point<qint16_t, qint32_t>;
             break;
-        case DataType::F32:
-            _func = &logits_1d_shift_exp_sum_f32;
-            break;
-        case DataType::F16:
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            _func = &logits_1d_shift_exp_sum_f16;
+        case DataType::F16:
+            _func = &logits_1d_softmax_float<float16_t>;
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        case DataType::F32:
+            _func = &logits_1d_softmax_float<float>;
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
             break;
@@ -686,224 +930,37 @@
     _input  = input;
     _max    = max;
     _output = output;
-    _sum    = sum;
     _beta   = beta;
+    _tmp    = tmp;
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
-Status NELogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta)
+/** Check whether the given tensor infos would result in a valid configuration of the kernel.
+ *
+ * The window configuration is evaluated on clones of the infos.
+ *
+ * @param[in] input  Info of the source tensor.
+ * @param[in] max    Info of the tensor holding the per-row maximum.
+ * @param[in] output Info of the destination tensor.
+ * @param[in] beta   Scaling factor of the exponent.
+ * @param[in] tmp    Info of the scratch tensor.
+ *
+ * @return An error status if any pointer is null or if the argument or window checks fail, an empty status otherwise.
+ */
+Status NELogits1DSoftmaxKernel::validate(const ITensorInfo *input, const ITensorInfo *max,
+                                         const ITensorInfo *output, const float beta, const ITensorInfo *tmp)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_shift_exp_sum(input, max, output, sum, beta));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_shift_exp_sum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
+    // validate() reports problems through its Status: null arguments are returned as an error instead of
+    // being asserted, so that the dereferences below are never reached with a null pointer
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, output, tmp);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*input, *max, *output, beta, *tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_softmax(*input->clone(), *max->clone(), *output->clone(), *tmp->clone()).first);
 
     return Status{};
 }
 
-void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info)
+/** Execute the configured softmax function on the given window.
+ *
+ * @param[in] window Region to process; checked against the window the kernel was configured with.
+ * @param[in] info   Thread information; thread_id and num_threads select this thread's part of the scratch tensor.
+ */
+void NELogits1DSoftmaxKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _max, _output, _sum, window, _beta);
+    // Each thread needs scratch space for one row of the valid region
+    const unsigned int num_elems_processed_per_iteration = _input->info()->valid_region().shape.x();
+    const unsigned int tmp_size_for_thread               = _tmp->info()->element_size() * num_elems_processed_per_iteration;
+
+    // The scratch tensor must be large enough to hold one row per thread
+    ARM_COMPUTE_ERROR_ON(_tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
+
+    // Slice of the scratch buffer owned by this thread, selected by its thread id
+    void *tmp_for_thread = _tmp->buffer() + (info.thread_id * tmp_size_for_thread);
+
+    (*_func)(*_input, *_max, tmp_for_thread, *_output, _beta, window);
 }
 
-namespace
-{
-void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window sum_slice = window_sum.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    const int fixed_point_position = in->info()->fixed_point_position();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator _sum(sum, sum_slice);
-        Iterator output(out, in_slice);
-
-        const int8_t     sum_value        = *reinterpret_cast<const qint8_t *>(_sum.ptr());
-        const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position);
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto in_ptr  = reinterpret_cast<const qint8_t *>(input.ptr());
-            const auto out_ptr = reinterpret_cast<qint8_t *>(output.ptr());
-
-            const qint8x16_t vec_in           = vld1q_qs8(in_ptr);
-            const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position);
-
-            vst1q_qs8(out_ptr, normalized_value);
-        },
-        input, output);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-void logits_1d_norm_qs16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window sum_slice = window_sum.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    const int fixed_point_position = in->info()->fixed_point_position();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator _sum(sum, sum_slice);
-        Iterator output(out, in_slice);
-
-        const int16_t    sum_value        = *reinterpret_cast<const qint16_t *>(_sum.ptr());
-        const qint16x8_t vec_sum_inversed = vqrecipq_qs16(vdupq_n_qs16(sum_value), fixed_point_position);
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto in_ptr  = reinterpret_cast<const qint16_t *>(input.ptr());
-            const auto out_ptr = reinterpret_cast<qint16_t *>(output.ptr());
-
-            const qint16x8_t vec_in           = vld1q_qs16(in_ptr);
-            const qint16x8_t normalized_value = vqmulq_qs16(vec_in, vec_sum_inversed, fixed_point_position);
-
-            vst1q_qs16(out_ptr, normalized_value);
-        },
-        input, output);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void logits_1d_norm_f16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window sum_slice = window_sum.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator _sum(sum, sum_slice);
-        Iterator output(out, in_slice);
-
-        const float16_t   sum_value        = *reinterpret_cast<const qint16_t *>(_sum.ptr());
-        const float16x8_t vec_sum_inversed = vdupq_n_f16(1.0f / sum_value);
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto in_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-            const auto out_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
-            const float16x8_t vec_in           = vld1q_f16(in_ptr);
-            const float16x8_t normalized_value = vmulq_f16(vec_in, vec_sum_inversed);
-
-            vst1q_f16(out_ptr, normalized_value);
-        },
-        input, output);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-    Window sum_slice = window_sum.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_1D();
-
-    do
-    {
-        Iterator input(in, in_slice);
-        Iterator _sum(sum, sum_slice);
-        Iterator output(out, in_slice);
-
-        const float       sum_value        = *reinterpret_cast<const float *>(_sum.ptr());
-        const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            const auto in_ptr  = reinterpret_cast<const float *>(input.ptr());
-            const auto out_ptr = reinterpret_cast<float *>(output.ptr());
-
-            const float32x4_t vec_in           = vld1q_f32(in_ptr);
-            const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
-
-            vst1q_f32(out_ptr, normalized_value);
-        },
-        input, output);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-} // namespace
-
-NELogits1DNormKernel::NELogits1DNormKernel()
-    : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr)
-{
-}
-
-void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_norm(input->info(), sum->info(), output->info()));
-
-    _input  = input;
-    _sum    = sum;
-    _output = output;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::QS8:
-            _func = &logits_1d_norm_qs8;
-            break;
-        case DataType::QS16:
-            _func = &logits_1d_norm_qs16;
-            break;
-        case DataType::F32:
-            _func = &logits_1d_norm_f32;
-            break;
-        case DataType::F16:
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            _func = &logits_1d_norm_f16;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-            break;
-    }
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window_logits_1d_norm(input->info(), sum->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-Status NELogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_norm(input, sum, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_norm(input->clone().get(), sum->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input, _sum, _output, window);
-}
+} // namespace arm_compute

diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 794c179..1501402 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -101,11 +101,12 @@
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     if(biases != nullptr)
     {
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));

diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
index ea48e1f..b2e44f8 100644
--- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp

@@ -29,121 +29,297 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "support/ToolchainSupport.h"
 
-#include "arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp"
-
-namespace
-{
-using T = WinogradConvolutionLayer<2, 2, 3, 3, float, float>;
-} // namespace
-
 namespace arm_compute
 {
-class Winograd3x3F32::Private
-{
-public:
-    Private(
-        const int          n_batches,         /** Number of batches in the input and output tensors. */
-        const int          n_input_channels,  /** Number of feature maps in a batch of the input tensor. */
-        const int          n_input_rows,      /** Number of rows in a feature map of the input tensor. */
-        const int          n_input_cols,      /** Number of columns in a feature map of the input tensor. */
-        const int          n_output_channels, /** Number of feature maps in the output tensor. */
-        const bool         same_padding,      /** Use "SAME" padding, otherwise use "VALID". */
-        const float *const weights,           /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */
-        float *const       weights_storage,   /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
-        const float *const input,             /** Pointer to NHWC ordered input tensor, in the spatial domain. */
-        float *const       winograd_input,    /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
-        float *const       output,            /** Pointer to NHWC ordered output tensor, in the spatial domain. */
-        float *const       winograd_output    /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
-    )
-        : convolver(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, output, winograd_output)
-    {
-    }
-    T convolver;
-};
-
-Winograd3x3F32::~Winograd3x3F32()
+// Batched GEMMs
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerBatchedGEMMKernel()
+    : _gemms()
 {
 }
 
-void Winograd3x3F32::transform_output()
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const unsigned int n_gemms,
+    const int M, const int K, const int N,
+    const int        a_matrix_stride,
+    const int        a_row_stride,
+    const int        b_matrix_stride,
+    const int        b_row_stride,
+    const int        c_matrix_stride,
+    const int        c_row_stride,
+    const TIn *const a_ptr,
+    const TIn *const b_ptr,
+    TOut *const      c_ptr)
 {
-    auto win = _pimpl->convolver.output_transform.get_window();
-    _pimpl->convolver.output_transform.run(0, win);
-}
-
-void Winograd3x3F32::transform_input()
-{
-    auto win = _pimpl->convolver.input_transform.get_window();
-    _pimpl->convolver.input_transform.run(0, win);
-}
-
-void Winograd3x3F32::transform_weights()
-{
-    auto win = _pimpl->convolver.weights_transform.get_window();
-    _pimpl->convolver.weights_transform.run(0, win);
-}
-
-Winograd3x3F32::Winograd3x3F32(
-    const int          n_batches,         /** Number of batches in the input and output tensors. */
-    const int          n_input_channels,  /** Number of feature maps in a batch of the input tensor. */
-    const int          n_input_rows,      /** Number of rows in a feature map of the input tensor. */
-    const int          n_input_cols,      /** Number of columns in a feature map of the input tensor. */
-    const int          n_output_channels, /** Number of feature maps in the output tensor. */
-    const bool         same_padding,      /** Use "SAME" padding, otherwise use "VALID". */
-    const float *const weights,           /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */
-    float *const       weights_storage,   /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
-    const float *const input,             /** Pointer to NHWC ordered input tensor, in the spatial domain. */
-    float *const       winograd_input,    /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
-    float *const       output,            /** Pointer to NHWC ordered output tensor, in the spatial domain. */
-    float *const       winograd_output    /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
-)
-    : _pimpl(support::cpp14::make_unique<Private>(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, output,
-                                                  winograd_output))
-{
-}
-
-unsigned int NEWinogradLayerKernel::get_input_storage_size(const int n_batches, const int n_channels, const int n_rows, const int n_cols, const bool same_padding)
-{
-    return T::get_input_storage_size(n_batches, n_channels, n_rows, n_cols, same_padding);
-}
-
-unsigned int NEWinogradLayerKernel::get_output_storage_size(
-    const int  n_batches,         /** Number of batches in the output tensor. */
-    const int  n_rows,            /** Number of rows in each feature map of the input tensor. */
-    const int  n_cols,            /** Number of columns in each feature map of the input tensor. */
-    const int  n_output_channels, /** Number of feature maps in the output tensor. */
-    const bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
-)
-{
-    return T::get_output_storage_size(n_batches, n_rows, n_cols, n_output_channels, same_padding);
-}
-
-unsigned int NEWinogradLayerKernel::get_weight_storage_size(const int n_output_channels, const int n_input_channels)
-{
-    return T::get_weight_storage_size(n_output_channels, n_input_channels);
-}
-
-NEWinogradLayerKernel::NEWinogradLayerKernel()
-    : _convolver(nullptr)
-{
-}
-
-void NEWinogradLayerKernel::configure(Winograd3x3F32 *convolver)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(convolver);
-    _convolver = convolver;
+    _gemms = support::cpp14::make_unique<MultiGEMM>(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr);
     Window win;
-    auto   win_last = _convolver->_pimpl->convolver.gemms.get_window();
+    auto   win_last = _gemms->get_window();
     win.set(Window::DimX, Window::Dimension(0, win_last, 1));
     INEKernel::configure(win);
 }
 
-void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info)
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     const size_t first_gemm = window.x().start();
     const size_t last_gemm  = window.x().end();
-    _convolver->_pimpl->convolver.gemms.run(first_gemm, last_gemm);
+    _gemms->run(first_gemm, last_gemm);
 }
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_gemms() const
+{
+    return WinogradBase::N_GEMMS;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_rows() const
+{
+    return _output_tile_rows;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_cols() const
+{
+    return _output_tile_cols;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_blocks() const
+{
+    return WinogradConv::N_BLOCK;
+}
+
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>;
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>;
+
+// Weights transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int n_output_channels, int n_input_channels) const
+{
+    const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels);
+    return static_cast<unsigned int>(
+               // WinogradConv returns the size in bytes; we divide by `sizeof(T)` to express that in units of T
+               WinogradConv::get_kernel_storage_size(shape) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
+    : _transform()
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(const KernelShape &kernel_shape) const
+{
+    return WinogradConv::get_kernel_matrix_stride(kernel_shape);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const ITensor *weights_hwio,
+    T *const       output,
+    const int      matrix_stride,     /** Stride across matrices in the output. */
+    const int      n_output_channels, /** Number of filters. */
+    const int      n_input_channels)  /** Number of channels in each filter. */
+{
+    const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK);
+    _transform                  = support::cpp14::make_unique<WeightsTransform>(reinterpret_cast<T *>(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels,
+                                                                                n_input_channels);
+    Window win;
+    auto   win_last = _transform->get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    _transform->run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
+{
+    return false;
+}
+
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
+
+// Input transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
+    int  n_batches,   /** Number of batches in the input tensor. */
+    int  n_channels,  /** Number of feature maps in the input tensor. */
+    int  n_rows,      /** Number of rows in each feature map. */
+    int  n_cols,      /** Number of columns in each feature map. */
+    bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+) const
+{
+    // Construct shapes for the input and kernel tensors.
+    const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
+    const KernelShape   kern_shape(1, KernelRows, KernelCols, n_channels);
+    const PaddingType   padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+    // Return the size, converted into units of T
+    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
+    const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
+{
+    return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
+    : _transform()
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const T *const    input,         /** Input tensor data */
+    const int         n_batches,     /** Number of batches in input tensor. */
+    const int         n_rows,        /** Number of rows in input tensor. */
+    const int         n_cols,        /** Number of columns in input tensor. */
+    const int         n_channels,    /** Number of channels in input tensor. */
+    const PaddingType padding,       /** Padding type. */
+    T *const          output,        /** Base of output matrices. */
+    const int         matrix_stride) /** Stride between output matrices. */
+{
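+    // NOTE(review): the trailing n_channels argument appears to be the input matrix row stride (formerly _input_matrix_row_stride(n_input_channels)) - confirm against InputTransform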
+    _transform = support::cpp14::make_unique<InputTransform>(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels);
+    Window win;
+    auto   win_last = _transform->get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    _transform->run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+bool NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
+{
+    return false;
+}
+
+template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
+
+// Output transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
+    int  n_batches,         /** Number of batches in the output tensor. */
+    int  n_rows,            /** Number of rows in each feature map of the input tensor. */
+    int  n_cols,            /** Number of columns in each feature map of the input tensor. */
+    int  n_output_channels, /** Number of feature maps in the output tensor. */
+    bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
+) const
+{
+    // Construct shapes for the input and kernel tensors.
+    const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
+    const KernelShape   kern_shape(n_output_channels, KernelRows, KernelCols, 1);
+    const PaddingType   padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+
+    // Return the size, converted into units of T
+    return static_cast<unsigned int>(
+               WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
+    : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0)
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
+    const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
+{
+    return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type);
+}
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Tensor4DShape NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
+    const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const
+{
+    return WinogradConv::get_output_shape(kernel_shape, in_shape, padding);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const ITensor *biases,
+    const T *const output_workingspace,
+    const int      matrix_stride,
+    T *const       output,
+    const int      n_batches,
+    const int      n_rows,
+    const int      n_cols,
+    const int      n_channels)
+{
+    _biases            = biases;
+    _output_workspace  = output_workingspace;
+    _matrix_stride     = matrix_stride;
+    _matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK);
+    _output            = output;
+    _n_batches         = n_batches;
+    _n_rows            = n_rows;
+    _n_cols            = n_cols;
+    _n_channels        = n_channels;
+
+    // We don't have the biases buffer at this stage as it hasn't been allocated, so we pass in nullptr: OutputTransform is only used here to compute the window
+    OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels);
+    Window          win;
+    auto            win_last = output_transform.get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
+
+    OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
+                                     reinterpret_cast<T *>(_biases->buffer()), _output,
+                                     _n_batches, _n_rows, _n_cols, _n_channels);
+
+    // The OutputTransform above cannot be created in configure() because biases hasn't been allocated at that point
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    output_transform.run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+bool NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
+{
+    return false;
+}
+
+template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
+
 } // namespace arm_compute

diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
new file mode 100644
index 0000000..0b3212b
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp

@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMMAArch64NativeKernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+                                                   bool is_transposed_1)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+    _input0          = input0;
+    _input1          = input1;
+    _output          = output;
+    _workspace       = workspace;
+    _alpha           = alpha;
+    _beta            = beta;
+    _is_transposed_0 = is_transposed_0;
+    _is_transposed_1 = is_transposed_1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(16U, 4U));
+
+    const int input0_access_end_x = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
+    const int input0_access_end_y = ceil_to_multiple(input0->info()->tensor_shape().y(), 4);
+    const int input1_access_end_x = ceil_to_multiple(input1->info()->tensor_shape().x(), 16);
+
+    AccessWindowStatic    input0_access(input0->info(), 0, 0, input0_access_end_x, input0_access_end_y);
+    AccessWindowStatic    input1_access(input1->info(), 0, 0, input1_access_end_x, input1->info()->tensor_shape().y());
+    AccessWindowRectangle output_access(output->info(), 0, 0, 16, 4);
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    INEKernel::configure(win);
+}
+
+void NEGEMMAArch64NativeKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_UNUSED(info);
+
+    const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
+
+    // Calculate row strides for each matrix
+    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
+    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
+    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+    // Calculate matrix sizes
+    const int M = std::min(_input0->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int K = _input0->info()->tensor_shape().x();
+    const int N = _input1->info()->tensor_shape().x();
+
+    // Create window (Only iterate over batches)
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    // Create Iterators
+    Iterator in0(_input0, window);
+    Iterator out(_output, window);
+
+    // Execute GEMM
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        BlockedGemm<4, 16, float, float>(reinterpret_cast<const float *>(in0.ptr()),
+                                         reinterpret_cast<const float *>(in1_ptr),
+                                         reinterpret_cast<float *>(out.ptr()),
+                                         M, K, N,
+                                         lda, ldb, ldc);
+    },
+    in0, out);
+}
+} // namespace arm_compute

diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
index 7827bc1..d4fcf5e 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -102,6 +102,11 @@
                   const ThreadInfo &info, ITensor *ws, int M, int N, int K, bool is_transposed_0, bool is_transposed_1,
                   int lda, int ldb, int ldc, float alpha, float beta)
 {
+    ARM_COMPUTE_UNUSED(M);
+    ARM_COMPUTE_UNUSED(N);
+    ARM_COMPUTE_UNUSED(K);
+    ARM_COMPUTE_UNUSED(is_transposed_0);
+    ARM_COMPUTE_UNUSED(is_transposed_1);
     GemmInterleaved<strategy, typename strategy::operand_type, typename strategy::result_type> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
     void *workspace = align_workspace(gemm, info, ws);
     execute_window_loop(win, [&](const Coordinates & id)

diff --git a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
new file mode 100644
index 0000000..163014b
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp

@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch-default"
+#pragma GCC diagnostic ignored "-Weffc++"
+#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
+#pragma GCC diagnostic pop
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEGEMVAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+    _input0          = input0;
+    _input1          = input1;
+    _output          = output;
+    _workspace       = workspace;
+    _alpha           = alpha;
+    _beta            = beta;
+    _is_transposed_0 = is_transposed_0;
+    _is_transposed_1 = is_transposed_1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info());
+
+    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 1);
+
+    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
+    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+    update_window_and_padding(win,
+                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+                              output_access);
+
+    INEKernel::configure(win);
+}
+
+void NEGEMVAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
+    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
+    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(sgemv_trans::result_type);
+
+    const auto in1_ptr = reinterpret_cast<const sgemv_trans::operand_type *>(_input1->buffer());
+
+    const int N = _output->info()->tensor_shape().x();
+    const int K = _input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(_input0, window);
+    Iterator out(_output, window);
+
+    GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type> gemm(&info.cpu_info, N, K);
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *workspace      = _workspace->buffer() + offset;
+    size_t           workspace_size = _workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const sgemv_trans::operand_type *>(in0.ptr()), lda,
+                     reinterpret_cast<const sgemv_trans::operand_type *>(in1_ptr), ldb,
+                     reinterpret_cast<sgemv_trans::result_type *>(out.ptr()), ldc,
+                     _alpha, _beta, workspace);
+    },
+    in0, out);
+}
+} // namespace arm_compute

diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
index 38b9102..e84409c 100644
--- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,15 +39,13 @@
 
 namespace arm_compute
 {
+#pragma GCC diagnostic push // Save the current diagnostic state before pulling in the assembly headers.
+#pragma GCC diagnostic ignored "-Wswitch-default" // Silence -Wswitch-default for the two includes that follow.
 #include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
+#pragma GCC diagnostic pop // Restore the diagnostic state saved by the push above.
 } // namespace arm_compute
 
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
 namespace arm_compute
 {
 void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,

diff --git a/src/core/NEON/kernels/winograd/utils.cpp b/src/core/NEON/kernels/convolution/common/utils.cpp
similarity index 100%
rename from src/core/NEON/kernels/winograd/utils.cpp
rename to src/core/NEON/kernels/convolution/common/utils.cpp


diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
new file mode 100644
index 0000000..fa50f79
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp

@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;  // Class whose static tile_fns member is specialised below.
+using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 1, 1, float, float>;  // Provides the process_tile<...> functions stored in the table.
+// tile_fns is a six-dimensional lookup table indexed [in pad top][in pad left][in pad bottom][in pad right][out pad bottom][out pad right]; every entry is the process_tile instantiation whose six template arguments equal those indices.
+template <>  // Explicit specialisation of the static data member for Conv.
+const Conv::TileFn Conv::tile_fns
+  [max_in_pad_top]
+  [max_in_pad_left]
+  [max_in_pad_bottom]
+  [max_in_pad_right]
+  [max_out_pad_bottom]
+  [max_out_pad_right] = {  // Each innermost brace below lists the two output-pad-right variants (0, then 1).
+  {  // Input pad top = 0
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 2
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 2
+    },  // Input pad left = 1
+  },  // Input pad top = 0
+  {  // Input pad top = 1
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 2
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+      },  // Input pad bottom = 2
+    },  // Input pad left = 1
+  },  // Input pad top = 1
+};
+// NOTE(review): the initialiser above supplies 2 x 2 x 3 x 3 x 2 x 2 entries; presumably this matches the max_* bounds declared in the header - confirm.
+// Explicit instantiation definition of the class template for this parameter set.
+template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
+}  // namespace depthwise

diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
new file mode 100644
index 0000000..0ec5a77
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp

@@ -0,0 +1,1095 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
+using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 2, 2, float, float>;
+
+template <>
+const Conv::TileFn Conv::tile_fns
+  [max_in_pad_top]
+  [max_in_pad_left]
+  [max_in_pad_bottom]
+  [max_in_pad_right]
+  [max_out_pad_bottom]
+  [max_out_pad_right] = {
+  {  // Input pad top = 0
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 1
+  },  // Input pad top = 0
+  {  // Input pad top = 1
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 0, 1>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 1, 1>,
+          },  // Output pad bottom = 1
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 1
+  },  // Input pad top = 1
+};
+
+
+template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;  // Explicit instantiation of the class template for this argument set.
+}  // namespace depthwise

diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
new file mode 100644
index 0000000..dc3c383
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp

@@ -0,0 +1,1175 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;  // Shorthand for the specialization whose tile_fns table is defined below.
+using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>;  // Shorthand for DepthwiseConvolutionImpl with the same template arguments.
+
+template <>
+const Conv::TileFn Conv::tile_fns
+  [max_in_pad_top]
+  [max_in_pad_left]
+  [max_in_pad_bottom]
+  [max_in_pad_right]
+  [max_out_pad_bottom]
+  [max_out_pad_right] = {
+  {  // Input pad top = 0
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 3
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 3
+    },  // Input pad left = 1
+  },  // Input pad top = 0
+  {  // Input pad top = 1
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 3
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+      },  // Input pad bottom = 3
+    },  // Input pad left = 1
+  },  // Input pad top = 1
+};
+
+
+template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;  // Explicit instantiation definition: emits this specialisation's members in this translation unit.
+}  // namespace depthwise

diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
new file mode 100644
index 0000000..8d511b1
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp

@@ -0,0 +1,3443 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;  // Shorthand for the configuration whose tile_fns table is specialised below. NOTE(review): the file name (3x3_3x3_2x2_fp32_fp32) suggests the arguments pair up as 3x3, 3x3, 2x2 - confirm each pair's meaning against the class template declaration.
+using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>;  // Impl counterpart with the same template arguments as Conv.
+
+template <>
+const Conv::TileFn Conv::tile_fns
+  [max_in_pad_top]
+  [max_in_pad_left]
+  [max_in_pad_bottom]
+  [max_in_pad_right]
+  [max_out_pad_bottom]
+  [max_out_pad_right] = {
+  {  // Input pad top = 0
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 1
+  },  // Input pad top = 0
+  {  // Input pad top = 1
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 0, 2>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 1, 2>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 2, 2>,
+          },  // Output pad bottom = 2
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 1
+  },  // Input pad top = 1
+};
+
+
+template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;  // Explicit instantiation of the class template for this parameter set.
+}  // namespace depthwise

diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
new file mode 100644
index 0000000..a1aaaa0
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp

@@ -0,0 +1,2695 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;  // Class whose static tile_fns table is specialised below.
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float, float>;  // Supplies the process_tile<...> instantiations stored in that table.
+
+template <>
+const Conv::TileFn Conv::tile_fns
+  [max_in_pad_top]
+  [max_in_pad_left]
+  [max_in_pad_bottom]
+  [max_in_pad_right]
+  [max_out_pad_bottom]
+  [max_out_pad_right] = {
+  {  // Input pad top = 0
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 0, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 0, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 0, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 0, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 0, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 0, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 0, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 0, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 0, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 0, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 0, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 0, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 1, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 1, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 1, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 1, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 1, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 1, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 1, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 1, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 1, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 1, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 1, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 1, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 2, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 2, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 2, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 2, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 2, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 2, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 2, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 2, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 2, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 2, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 2, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 2, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 3, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 3, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 3, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 3, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 3, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 3, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 3, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 3, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 3, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 3, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 3, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 3, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 3, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 3, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 3, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 3, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 3, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 3, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 3, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 3, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 4, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 4, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 4, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 4, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 4, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 4, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 4, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 4, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 4, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 4, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 4, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 4, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 4, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 4, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 4, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 4, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 0, 4, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 0, 4, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 0, 4, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 0, 4, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 0, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 0, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 0, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 0, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 0, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 0, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 0, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 0, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 0, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 0, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 0, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 0, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 0, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 1, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 1, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 1, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 1, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 1, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 1, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 1, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 1, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 1, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 1, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 1, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 1, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 2, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 2, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 2, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 2, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 2, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 2, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 2, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 2, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 2, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 2, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 2, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 2, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 3, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 3, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 3, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 3, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 3, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 3, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 3, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 3, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 3, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 3, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 3, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 3, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 3, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 3, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 3, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 3, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 3, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 3, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 3, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 3, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 4, 0, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 4, 0, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 4, 0, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 4, 0, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 4, 1, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 4, 1, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 4, 1, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 4, 1, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 4, 2, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 4, 2, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 4, 2, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 4, 2, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 4, 3, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 4, 3, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 4, 3, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 4, 3, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<0, 1, 4, 4, 0, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 0, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 0, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<0, 1, 4, 4, 1, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 1, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 1, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<0, 1, 4, 4, 2, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 2, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 2, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<0, 1, 4, 4, 3, 0>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 3, 1>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 3, 2>,
+            ConvImpl::template process_tile<0, 1, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 1
+  },  // Input pad top = 0
+  {  // Input pad top = 1
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 0, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 0, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 0, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 0, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 0, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 0, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 0, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 0, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 0, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 0, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 0, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 0, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 1, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 1, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 1, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 1, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 1, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 1, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 1, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 1, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 1, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 1, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 1, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 1, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 2, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 2, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 2, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 2, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 2, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 2, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 2, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 2, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 2, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 2, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 2, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 2, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 3, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 3, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 3, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 3, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 3, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 3, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 3, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 3, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 3, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 3, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 3, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 3, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 3, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 3, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 3, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 3, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 3, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 3, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 3, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 3, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 4, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 4, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 4, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 4, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 4, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 4, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 4, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 4, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 4, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 4, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 4, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 4, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 4, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 4, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 4, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 4, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 0, 4, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 0, 4, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 0, 4, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 0, 4, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 0, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 0, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 0, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 0, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 0, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 0, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 0, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 0, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 0, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 0, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 0, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 0, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 0, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 1, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 1, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 1, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 1, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 1, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 1, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 1, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 1, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 1, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 1, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 1, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 1, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 2, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 2, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 2, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 2, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 2, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 2, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 2, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 2, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 2, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 2, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 2, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 2, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 3, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 3, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 3, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 3, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 3, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 3, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 3, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 3, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 3, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 3, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 3, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 3, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 3, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 3, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 3, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 3, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 3, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 3, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 3, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 3, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 4, 0, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 4, 0, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 4, 0, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 4, 0, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 4, 1, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 4, 1, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 4, 1, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 4, 1, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 4, 2, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 4, 2, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 4, 2, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 4, 2, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 4, 3, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 4, 3, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 4, 3, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 4, 3, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            ConvImpl::template process_tile<1, 1, 4, 4, 0, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 0, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 0, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            ConvImpl::template process_tile<1, 1, 4, 4, 1, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 1, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 1, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            ConvImpl::template process_tile<1, 1, 4, 4, 2, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 2, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 2, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            ConvImpl::template process_tile<1, 1, 4, 4, 3, 0>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 3, 1>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 3, 2>,
+            ConvImpl::template process_tile<1, 1, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+      },  // Input pad bottom = 4
+    },  // Input pad left = 1
+  },  // Input pad top = 1
+};
+
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;  // Explicit instantiation definition of DepthwiseConvolution for this exact argument list, so its members are generated in this translation unit.
+}  // namespace depthwise

diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
new file mode 100644
index 0000000..2104c0b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp

@@ -0,0 +1,5207 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;  // Class whose tile_fns member table is specialised below.
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float, float>;  // Same template arguments on the Impl template. NOTE(review): the visible table entries reference Conv::process_tile, not ConvImpl - confirm this alias is used further down.
+
+template <>
+const Conv::TileFn Conv::tile_fns
+  [max_in_pad_top]
+  [max_in_pad_left]
+  [max_in_pad_bottom]
+  [max_in_pad_right]
+  [max_out_pad_bottom]
+  [max_out_pad_right] = {
+  {  // Input pad top = 0
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 0, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 0, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 0, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 0, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 0, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 0, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 0, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 0, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 0, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 0, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 1, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 1, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 1, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 1, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 1, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 1, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 1, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 1, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 1, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 1, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 2, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 2, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 2, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 2, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 2, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 2, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 2, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 2, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 2, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 2, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 3, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 3, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 3, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 3, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 3, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 3, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 3, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 3, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 3, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 3, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 4, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 4, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 4, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 4, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 4, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 4, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 4, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 4, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 4, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 4, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 5, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 5, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 5, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 5, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 5, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 5, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 5, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 5, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 5, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 5, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 0, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 0, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 0, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 0, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 0, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 0, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 1, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 1, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 1, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 1, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 1, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 1, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 2, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 2, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 2, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 2, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 2, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 2, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 3, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 3, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 3, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 3, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 3, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 3, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 4, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 4, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 4, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 4, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 4, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 4, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 5, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 5, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 5, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 5, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 5, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 5, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 0, 6, 6, 0, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 0, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 0, 2>,
+            Conv::template process_tile<0, 0, 6, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 0, 6, 6, 1, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 1, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 1, 2>,
+            Conv::template process_tile<0, 0, 6, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 0, 6, 6, 2, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 2, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 2, 2>,
+            Conv::template process_tile<0, 0, 6, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 0, 6, 6, 3, 0>,
+            Conv::template process_tile<0, 0, 6, 6, 3, 1>,
+            Conv::template process_tile<0, 0, 6, 6, 3, 2>,
+            Conv::template process_tile<0, 0, 6, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 0, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 0, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 0, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 0, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 0, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 0, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 0, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 0, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 0, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 0, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 1, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 1, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 1, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 1, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 1, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 1, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 1, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 1, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 1, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 1, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 2, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 2, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 2, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 2, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 2, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 2, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 2, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 2, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 2, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 2, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 3, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 3, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 3, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 3, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 3, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 3, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 3, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 3, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 3, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 3, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 4, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 4, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 4, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 4, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 4, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 4, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 4, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 4, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 4, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 4, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 5, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 5, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 5, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 5, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 5, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 5, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 5, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 5, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 5, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 5, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 0, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 0, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 0, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 0, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 0, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 0, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 1, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 1, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 1, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 1, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 1, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 1, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 2, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 2, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 2, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 2, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 2, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 2, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 3, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 3, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 3, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 3, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 3, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 3, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 4, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 4, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 4, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 4, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 4, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 4, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 5, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 5, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 5, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 5, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 5, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 5, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<0, 1, 6, 6, 0, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 0, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 0, 2>,
+            Conv::template process_tile<0, 1, 6, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<0, 1, 6, 6, 1, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 1, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 1, 2>,
+            Conv::template process_tile<0, 1, 6, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<0, 1, 6, 6, 2, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 2, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 2, 2>,
+            Conv::template process_tile<0, 1, 6, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<0, 1, 6, 6, 3, 0>,
+            Conv::template process_tile<0, 1, 6, 6, 3, 1>,
+            Conv::template process_tile<0, 1, 6, 6, 3, 2>,
+            Conv::template process_tile<0, 1, 6, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 1
+  },  // Input pad top = 0
+  {  // Input pad top = 1
+    {  // Input pad left = 0
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 0, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 0, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 0, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 0, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 0, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 0, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 0, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 0, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 0, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 0, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 1, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 1, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 1, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 1, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 1, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 1, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 1, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 1, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 1, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 1, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 2, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 2, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 2, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 2, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 2, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 2, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 2, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 2, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 2, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 2, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 3, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 3, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 3, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 3, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 3, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 3, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 3, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 3, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 3, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 3, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 4, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 4, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 4, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 4, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 4, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 4, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 4, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 4, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 4, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 4, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 5, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 5, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 5, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 5, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 5, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 5, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 5, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 5, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 5, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 5, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 0, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 0, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 0, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 0, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 0, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 0, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 1, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 1, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 1, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 1, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 1, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 1, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 2, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 2, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 2, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 2, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 2, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 2, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 3, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 3, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 3, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 3, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 3, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 3, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 4, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 4, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 4, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 4, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 4, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 4, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 5, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 5, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 5, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 5, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 5, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 5, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 0, 6, 6, 0, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 0, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 0, 2>,
+            Conv::template process_tile<1, 0, 6, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 0, 6, 6, 1, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 1, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 1, 2>,
+            Conv::template process_tile<1, 0, 6, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 0, 6, 6, 2, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 2, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 2, 2>,
+            Conv::template process_tile<1, 0, 6, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 0, 6, 6, 3, 0>,
+            Conv::template process_tile<1, 0, 6, 6, 3, 1>,
+            Conv::template process_tile<1, 0, 6, 6, 3, 2>,
+            Conv::template process_tile<1, 0, 6, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 0
+    {  // Input pad left = 1
+      {  // Input pad bottom = 0
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 0, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 0, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 0, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 0, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 0, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 0, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 0, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 0, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 0, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 0, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 0
+      {  // Input pad bottom = 1
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 1, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 1, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 1, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 1, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 1, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 1, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 1, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 1, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 1, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 1, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 1
+      {  // Input pad bottom = 2
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 2, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 2, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 2, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 2, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 2, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 2, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 2, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 2, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 2, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 2, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 2
+      {  // Input pad bottom = 3
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 3, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 3, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 3, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 3, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 3, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 3, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 3, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 3, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 3, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 3, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 3
+      {  // Input pad bottom = 4
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 4, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 4, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 4, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 4, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 4, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 4, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 4, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 4, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 4, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 4, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 4
+      {  // Input pad bottom = 5
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 5, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 5, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 5, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 5, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 5, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 5, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 5, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 5, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 5, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 5, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 5
+      {  // Input pad bottom = 6
+        {  // Input pad right = 0
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 0, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 0, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 0, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 0, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 0, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 0, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 0, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 0, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 0, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 0, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 0
+        {  // Input pad right = 1
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 1, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 1, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 1, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 1, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 1, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 1, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 1, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 1, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 1, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 1, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 1
+        {  // Input pad right = 2
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 2, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 2, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 2, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 2, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 2, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 2, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 2, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 2, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 2, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 2, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 2
+        {  // Input pad right = 3
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 3, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 3, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 3, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 3, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 3, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 3, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 3, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 3, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 3, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 3, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 3
+        {  // Input pad right = 4
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 4, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 4, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 4, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 4, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 4, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 4, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 4, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 4, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 4, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 4, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 4
+        {  // Input pad right = 5
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 5, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 5, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 5, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 5, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 5, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 5, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 5, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 5, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 5, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 5, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 5
+        {  // Input pad right = 6
+          {  // Output pad bottom = 0
+            Conv::template process_tile<1, 1, 6, 6, 0, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 0, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 0, 2>,
+            Conv::template process_tile<1, 1, 6, 6, 0, 3>,
+          },  // Output pad bottom = 0
+          {  // Output pad bottom = 1
+            Conv::template process_tile<1, 1, 6, 6, 1, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 1, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 1, 2>,
+            Conv::template process_tile<1, 1, 6, 6, 1, 3>,
+          },  // Output pad bottom = 1
+          {  // Output pad bottom = 2
+            Conv::template process_tile<1, 1, 6, 6, 2, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 2, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 2, 2>,
+            Conv::template process_tile<1, 1, 6, 6, 2, 3>,
+          },  // Output pad bottom = 2
+          {  // Output pad bottom = 3
+            Conv::template process_tile<1, 1, 6, 6, 3, 0>,
+            Conv::template process_tile<1, 1, 6, 6, 3, 1>,
+            Conv::template process_tile<1, 1, 6, 6, 3, 2>,
+            Conv::template process_tile<1, 1, 6, 6, 3, 3>,
+          },  // Output pad bottom = 3
+        },  // Input pad right = 6
+      },  // Input pad bottom = 6
+    },  // Input pad left = 1
+  },  // Input pad top = 1
+};
+
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
+}  // namespace depthwise

diff --git a/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp
similarity index 94%
rename from src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp
rename to src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp
index 52c2db8..ac83bf9 100644
--- a/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.cpp

@@ -22,8 +22,9 @@
  * SOFTWARE.
  */
 
-#include "batched_blocked_gemm.hpp"
-#include "gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
+
 using namespace winograd;
 
 template <const int MB, const int NB, typename TIn, typename TOut>

diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
similarity index 97%
rename from src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
index 381ae92..6d8afc0 100644
--- a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp

@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 
-#include "transforms/input.hpp"
-#include "winograd_gemm.hpp"
-#include "arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
 namespace winograd
 {

diff --git a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp
similarity index 67%
copy from src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp
copy to src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp
index 477aaaf..ebc0c07 100644
--- a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp

@@ -21,57 +21,32 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
-#include "transforms/input.hpp"
-#include "winograd_gemm.hpp"
-#include "arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
 namespace winograd
 {
 
-using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
+using Transform = WinogradGEMM<2, 2, 5, 5>::InputTransform<float>;
 
 template <>
 template <>
 int Transform::ops_performed(const Tensor4DShape &input_shape)
 {
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
-  const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
-  return 12 * 24 * tile_M * tile_N * input_shape.n_channels;
+  (void) input_shape;
+  return 0;
 }
 
-/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a
-* variety of padding types. For example, tiles at the top and left of an
-* image can require one row or column of padding on their top and left sides
-* if the padding type is SAME (where X represents a padded value):
-*
-*      ___________    ___________
-*     |X X X X X X|  |X X X X X X|
-*     |X          |  |           |
-*     |X          |  |           |
-*     |X          |  |           |
-*     |X          |  |           |
-*     |X__________|  |___________|
-*      ___________
-*     |X          |
-*     |X          |
-*     |X          |
-*     |X          |
-*     |X          |
-*     |X__________|
-*
-* For tiles near the right or bottom of the image it is more complicated.
-* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the
-* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding
-* type is SAME.
+/*****************************************************************************
+* F(2x2, 5x5) implies the use of a 6x6 input tile.
 *
 * Build an array of the specialised methods that deal with each of the
 * different padding combinations which may be required. These padding
 * constraints are the space:
 *
-*     Padding top in {0, 1}
-*     Padding left in {0, 1}
+*     Padding top in {0, 2}
+*     Padding left in {0, 2}
 *     Padding bottom in {0, 1, 2, 3, 4}
 *     Padding right in {0, 1, 2, 3, 4}
 */
@@ -321,9 +296,6 @@
   }
 }
 
-/* In the below, unusual or especially small tiles are routed via the slow
- * path whereas common or large tiles are routed through a faster path.
- */
 template <>
 template <>
 const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
@@ -368,119 +340,119 @@
     },
     {
       {
-        Transform::template process_tile<0, 1, 0, 0>,  // Left
-        Transform::template process_tile<0, 1, 0, 1>,
-        Transform::template process_tile<0, 1, 0, 2>,
-        Transform::template process_tile<0, 1, 0, 3>,
-        Transform::template process_tile<0, 1, 0, 4>,
+        Transform::template process_tile<0, 2, 0, 0>,  // Left
+        Transform::template process_tile<0, 2, 0, 1>,
+        Transform::template process_tile<0, 2, 0, 2>,
+        Transform::template process_tile<0, 2, 0, 3>,
+        Transform::template process_tile<0, 2, 0, 4>,
       },
       {
-        Transform::template process_tile<0, 1, 1, 0>,  // Bottom left
-        Transform::template process_tile<0, 1, 1, 1>,
-        Transform::template process_tile<0, 1, 1, 2>,
-        Transform::template process_tile<0, 1, 1, 3>,
-        Transform::template process_tile<0, 1, 1, 4>,
+        Transform::template process_tile<0, 2, 1, 0>,  // Bottom left
+        Transform::template process_tile<0, 2, 1, 1>,
+        Transform::template process_tile<0, 2, 1, 2>,
+        Transform::template process_tile<0, 2, 1, 3>,
+        Transform::template process_tile<0, 2, 1, 4>,
       },
       {
-        Transform::template process_tile<0, 1, 2, 0>,  // "          "
-        Transform::template process_tile<0, 1, 2, 1>,
-        Transform::template process_tile<0, 1, 2, 2>,
-        Transform::template process_tile<0, 1, 2, 3>,
-        Transform::template process_tile<0, 1, 2, 4>,
+        Transform::template process_tile<0, 2, 2, 0>,  // "          "
+        Transform::template process_tile<0, 2, 2, 1>,
+        Transform::template process_tile<0, 2, 2, 2>,
+        Transform::template process_tile<0, 2, 2, 3>,
+        Transform::template process_tile<0, 2, 2, 4>,
       },
       {
-        Transform::template process_tile<0, 1, 3, 0>,  // "          "
-        Transform::template process_tile<0, 1, 3, 1>,
-        Transform::template process_tile<0, 1, 3, 2>,
-        Transform::template process_tile<0, 1, 3, 3>,
-        Transform::template process_tile<0, 1, 3, 4>,
+        Transform::template process_tile<0, 2, 3, 0>,  // "          "
+        Transform::template process_tile<0, 2, 3, 1>,
+        Transform::template process_tile<0, 2, 3, 2>,
+        Transform::template process_tile<0, 2, 3, 3>,
+        Transform::template process_tile<0, 2, 3, 4>,
       },
       {
-        Transform::template process_tile<0, 1, 4, 0>,  // "          "
-        Transform::template process_tile<0, 1, 4, 1>,
-        Transform::template process_tile<0, 1, 4, 2>,
-        Transform::template process_tile<0, 1, 4, 3>,
-        Transform::template process_tile<0, 1, 4, 4>,
+        Transform::template process_tile<0, 2, 4, 0>,  // "          "
+        Transform::template process_tile<0, 2, 4, 1>,
+        Transform::template process_tile<0, 2, 4, 2>,
+        Transform::template process_tile<0, 2, 4, 3>,
+        Transform::template process_tile<0, 2, 4, 4>,
       }
     }
   },
   {
     {
       {
-        Transform::template process_tile<1, 0, 0, 0>,  // Top
-        Transform::template process_tile<1, 0, 0, 1>,  // Top right
-        Transform::template process_tile<1, 0, 0, 2>,  // "       "
-        Transform::template process_tile<1, 0, 0, 3>,  // "       "
-        Transform::template process_tile<1, 0, 0, 4>,  // "       "
+        Transform::template process_tile<2, 0, 0, 0>,  // Top
+        Transform::template process_tile<2, 0, 0, 1>,  // Top right
+        Transform::template process_tile<2, 0, 0, 2>,  // "       "
+        Transform::template process_tile<2, 0, 0, 3>,  // "       "
+        Transform::template process_tile<2, 0, 0, 4>,  // "       "
       },
       {
-        Transform::template process_tile<1, 0, 1, 0>,
-        Transform::template process_tile<1, 0, 1, 1>,
-        Transform::template process_tile<1, 0, 1, 2>,
-        Transform::template process_tile<1, 0, 1, 3>,
-        Transform::template process_tile<1, 0, 1, 4>,
+        Transform::template process_tile<2, 0, 1, 0>,
+        Transform::template process_tile<2, 0, 1, 1>,
+        Transform::template process_tile<2, 0, 1, 2>,
+        Transform::template process_tile<2, 0, 1, 3>,
+        Transform::template process_tile<2, 0, 1, 4>,
       },
       {
-        Transform::template process_tile<1, 0, 2, 0>,
-        Transform::template process_tile<1, 0, 2, 1>,
-        Transform::template process_tile<1, 0, 2, 2>,
-        Transform::template process_tile<1, 0, 2, 3>,
-        Transform::template process_tile<1, 0, 2, 4>,
+        Transform::template process_tile<2, 0, 2, 0>,
+        Transform::template process_tile<2, 0, 2, 1>,
+        Transform::template process_tile<2, 0, 2, 2>,
+        Transform::template process_tile<2, 0, 2, 3>,
+        Transform::template process_tile<2, 0, 2, 4>,
       },
       {
-        Transform::template process_tile<1, 0, 3, 0>,
-        Transform::template process_tile<1, 0, 3, 1>,
-        Transform::template process_tile<1, 0, 3, 2>,
-        Transform::template process_tile<1, 0, 3, 3>,
-        Transform::template process_tile<1, 0, 3, 4>,
+        Transform::template process_tile<2, 0, 3, 0>,
+        Transform::template process_tile<2, 0, 3, 1>,
+        Transform::template process_tile<2, 0, 3, 2>,
+        Transform::template process_tile<2, 0, 3, 3>,
+        Transform::template process_tile<2, 0, 3, 4>,
       },
       {
-        Transform::template process_tile<1, 0, 4, 0>,
-        Transform::template process_tile<1, 0, 4, 1>,
-        Transform::template process_tile<1, 0, 4, 2>,
-        Transform::template process_tile<1, 0, 4, 3>,
-        Transform::template process_tile<1, 0, 4, 4>,
+        Transform::template process_tile<2, 0, 4, 0>,
+        Transform::template process_tile<2, 0, 4, 1>,
+        Transform::template process_tile<2, 0, 4, 2>,
+        Transform::template process_tile<2, 0, 4, 3>,
+        Transform::template process_tile<2, 0, 4, 4>,
       },
     },
     {
       {
-        Transform::template process_tile<1, 1, 0, 0>,  // Top left
-        Transform::template process_tile<1, 1, 0, 1>,
-        Transform::template process_tile<1, 1, 0, 2>,
-        Transform::template process_tile<1, 1, 0, 3>,
-        Transform::template process_tile<1, 1, 0, 4>,
+        Transform::template process_tile<2, 2, 0, 0>,  // Top left
+        Transform::template process_tile<2, 2, 0, 1>,
+        Transform::template process_tile<2, 2, 0, 2>,
+        Transform::template process_tile<2, 2, 0, 3>,
+        Transform::template process_tile<2, 2, 0, 4>,
       },
       {
-        Transform::template process_tile<1, 1, 1, 0>,
-        Transform::template process_tile<1, 1, 1, 1>,
-        Transform::template process_tile<1, 1, 1, 2>,
-        Transform::template process_tile<1, 1, 1, 3>,
-        Transform::template process_tile<1, 1, 1, 4>,
+        Transform::template process_tile<2, 2, 1, 0>,
+        Transform::template process_tile<2, 2, 1, 1>,
+        Transform::template process_tile<2, 2, 1, 2>,
+        Transform::template process_tile<2, 2, 1, 3>,
+        Transform::template process_tile<2, 2, 1, 4>,
       },
       {
-        Transform::template process_tile<1, 1, 2, 0>,
-        Transform::template process_tile<1, 1, 2, 1>,
-        Transform::template process_tile<1, 1, 2, 2>,
-        Transform::template process_tile<1, 1, 2, 3>,
-        Transform::template process_tile<1, 1, 2, 4>,
+        Transform::template process_tile<2, 2, 2, 0>,
+        Transform::template process_tile<2, 2, 2, 1>,
+        Transform::template process_tile<2, 2, 2, 2>,
+        Transform::template process_tile<2, 2, 2, 3>,
+        Transform::template process_tile<2, 2, 2, 4>,
       },
       {
-        Transform::template process_tile<1, 1, 3, 0>,
-        Transform::template process_tile<1, 1, 3, 1>,
-        Transform::template process_tile<1, 1, 3, 2>,
-        Transform::template process_tile<1, 1, 3, 3>,
-        Transform::template process_tile<1, 1, 3, 4>,
+        Transform::template process_tile<2, 2, 3, 0>,
+        Transform::template process_tile<2, 2, 3, 1>,
+        Transform::template process_tile<2, 2, 3, 2>,
+        Transform::template process_tile<2, 2, 3, 3>,
+        Transform::template process_tile<2, 2, 3, 4>,
       },
       {
-        Transform::template process_tile<1, 1, 4, 0>,
-        Transform::template process_tile<1, 1, 4, 1>,
-        Transform::template process_tile<1, 1, 4, 2>,
-        Transform::template process_tile<1, 1, 4, 3>,
-        Transform::template process_tile<1, 1, 4, 4>,
+        Transform::template process_tile<2, 2, 4, 0>,
+        Transform::template process_tile<2, 2, 4, 1>,
+        Transform::template process_tile<2, 2, 4, 2>,
+        Transform::template process_tile<2, 2, 4, 3>,
+        Transform::template process_tile<2, 2, 4, 4>,
       }
     }
   }
 };
 
-template struct WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
+template struct WinogradGEMM<2, 2, 5, 5>::InputTransform<float>;
 }  // namespace winograd

diff --git a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp
similarity index 98%
rename from src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp
index 477aaaf..04d1573 100644
--- a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp

@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 
-#include "transforms/input.hpp"
-#include "winograd_gemm.hpp"
-#include "arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
 namespace winograd
 {

diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
similarity index 89%
rename from src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index e7907d1..a95ce0e 100644
--- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp

@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 
-#include "transforms/output.hpp"
-#include "winograd_gemm.hpp"
-#include "arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
 namespace winograd
 {
@@ -65,6 +65,7 @@
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
+  const float* const biases,
   float* const output,
   const int output_row_stride,
   const int output_col_stride
@@ -83,6 +84,7 @@
     }
   }
   const float *inptr = matrix_base;
+  const float *bptr = biases;
 
   // For each channel of the output
   int channels_remaining = n_channels;
@@ -90,7 +92,7 @@
   for (; channels_remaining >= 4; channels_remaining -= 4)
   {
     // Matrices used and computed during this transform
-    float32x4_t F[4][4], FZ[4][2], f[2][2];
+    float32x4_t F[4][4], FZ[4][2], f[2][2], b;
 
     // Read a 4x4 tile in the Winograd domain
     for (int i = 0, m = 0; i < 4; i++)
@@ -122,12 +124,16 @@
       f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
     }
 
+    // Load the bias vector
+    b = vld1q_f32(bptr);
+    bptr += 4;
+
     // Write out the output tile
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        vst1q_f32(outptrs[i][j], f[i][j]);
+        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
         outptrs[i][j] += 4;
       }
     }
@@ -137,7 +143,7 @@
   for (; channels_remaining >= 2; channels_remaining -= 2)
   {
     // Matrices used and computed during this transform
-    float32x2_t F[4][4], FZ[4][2], f[2][2];
+    float32x2_t F[4][4], FZ[4][2], f[2][2], b;
 
     // Read a 4x4 tile in the Winograd domain
     for (int i = 0, m = 0; i < 4; i++)
@@ -169,12 +175,16 @@
       f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
     }
 
+    // Load the bias vector
+    b = vld1_f32(bptr);
+    bptr += 2;
+
     // Write out the output tile
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        vst1_f32(outptrs[i][j], f[i][j]);
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
         outptrs[i][j] += 2;
       }
     }
@@ -183,7 +193,7 @@
   for (; channels_remaining; channels_remaining--)
   {
     // Matrices used and computed during this transform
-    float F[4][4], FZ[4][2], f[2][2];
+    float F[4][4], FZ[4][2], f[2][2], b;
 
     // Read a 4x4 tile in the Winograd domain
     for (int i = 0, m = 0; i < 4; i++)
@@ -209,12 +219,15 @@
       f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
     }
 
+    // Load the bias
+    b = *(bptr++);
+
     // Write out the output tile
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        *(outptrs[i][j]++) = f[i][j];
+        *(outptrs[i][j]++) = f[i][j] + b;
       }
     }
   }

diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
new file mode 100644
index 0000000..6bb1674
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp

@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace winograd
+{
+
+using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
+
+template <>
+template <>
+int Transform::ops_performed(const Tensor4DShape &shape)  // NOTE: shape is not consulted
+{
+  return 0;  // Always reports zero operations. NOTE(review): presumably a placeholder cost model - confirm
+}
+
+/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use
+ * enough tiles to cover the output space, each output tile may contain 0 or 1
+ * padded values in its right-most column and/or its bottom row, e.g.:
+ *
+ *      ___     ___
+ *     |   |   |  X|
+ *     |___|   |__X|
+ *
+ *      ___     ___
+ *     |   |   |  X|
+ *     |X_X|   |X_X|
+ *
+ *
+ * We provide a specialised output transform for each of these cases
+ * (process_tile<pad_bottom, pad_right>). Below, we construct an array of the
+ * various padding options; the array holds pointers to those specialisations.
+ */
+template <>
+template <>
+template <int pad_bottom, int pad_right>
+void Transform::process_tile(
+  const int n_channels,  // Number of channels to transform
+  const float* const matrix_base,  // Element 0 of the 6x6 Winograd-domain tile, first channel
+  const int matrix_stride,  // Offset (in floats) between consecutive tile elements
+  const float* const biases,  // Per-channel biases, consumed in channel order
+  float* const output,  // Top-left cell of the output tile, first channel
+  const int output_row_stride,  // Offset (in floats) between rows of the output
+  const int output_col_stride  // Offset (in floats) between columns of the output
+)
+{
+  constexpr int cells_i = 2 - pad_bottom;  // Number of tile rows actually written
+  constexpr int cells_j = 2 - pad_right;  // Number of tile columns actually written
+
+  // Construct a map to the output cells which are actually written
+  float *outptrs[cells_i][cells_j];
+  for (int i = 0; i < cells_i; i++)
+  {
+    for (int j = 0; j < cells_j; j++)
+    {
+      outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+    }
+  }
+  const float *inptr = matrix_base;
+  const float *bptr = biases;
+
+  // For each channel of the output (vector loops first, scalar loop for the remainder)
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used and computed during this transform (four channels per vector)
+    float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 4;  // Advance to the next four channels
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+    }
+
+    // Load the biases, then write out the output tile with the bias added
+    b = vld1q_f32(bptr);
+    bptr += 4;
+    for (int i = 0; i < cells_i; i++)
+    {
+      for (int j = 0; j < cells_j; j++)
+      {
+        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+        outptrs[i][j] += 4;
+      }
+    }
+  }
+#endif  // __aarch64__
+#ifdef __arm_any__
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform (two channels per vector)
+    float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+      }
+    }
+    inptr += 2;  // Advance to the next two channels
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+    }
+
+    // Load the biases, then write out the output tile with the bias added
+    b = vld1_f32(bptr);
+    bptr += 2;
+    for (int i = 0; i < cells_i; i++)
+    {
+      for (int j = 0; j < cells_j; j++)
+      {
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+        outptrs[i][j] += 2;
+      }
+    }
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform (one channel at a time)
+    float F[6][6], FZ[6][2], f[2][2], b;
+
+    // Read a 6x6 tile in the Winograd domain
+    for (int i = 0, m = 0; i < 6; i++)
+    {
+      for (int j = 0; j < 6; j++, m++)
+      {
+        F[i][j] = *(inptr + m*matrix_stride);
+      }
+    }
+    inptr++;  // Advance to the next channel
+
+    // Compute the matrix F Z
+    for (int i = 0; i < 6; i++)
+    {
+      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+      FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+    }
+
+    // Compute the output tile f = ZT F Z
+    for (int j = 0; j < 2; j++)
+    {
+      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+      f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+    }
+
+    // Load the bias, then write out the output tile with the bias added
+    b = *(bptr++);
+    for (int i = 0; i < cells_i; i++)
+    {
+      for (int j = 0; j < cells_j; j++)
+      {
+        *(outptrs[i][j]++) = f[i][j] + b;
+      }
+    }
+  }
+}
+
+template <>
+template <>
+const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =  // Indexed as [pad_bottom][pad_right]
+{
+  {
+    Transform::template process_tile<0, 0>,  // No padding
+    Transform::template process_tile<0, 1>,  // Right padding
+  },
+  {
+    Transform::template process_tile<1, 0>,  // Bottom padding
+    Transform::template process_tile<1, 1>,  // Bottom and right padding
+  }
+};
+
+template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;  // Explicit instantiation of the fp32 output transform
+}  // namespace winograd

diff --git a/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
similarity index 92%
rename from src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index 5eac334..8f47736 100644
--- a/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp

@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 
-#include "transforms/output.hpp"
-#include "winograd_gemm.hpp"
-#include "arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
 namespace winograd
 {
@@ -41,6 +41,9 @@
   return 170 * tile_M * tile_N * shape.n_channels;
 }
 
+// Instantiate cost methods
+template int Transform::ops_performed(const Tensor4DShape&);
+
 /* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
  * enough tiles to cover the output space each output tile may contain up to 3
  * padded values to the right and bottom columns or rows of the tile, e.g.:
@@ -79,6 +82,7 @@
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
+  const float* const biases,
   float* const output,
   const int output_row_stride,
   const int output_col_stride
@@ -97,6 +101,7 @@
     }
   }
   const float *inptr = matrix_base;
+  const float *bptr = biases;
 
   // For each channel of the output
   int channels_remaining = n_channels;
@@ -104,7 +109,7 @@
   for (; channels_remaining >= 4; channels_remaining -= 4)
   {
     // Matrices used and computed during this transform
-    float32x4_t F[6][6], FZ[6][4], f[4][4];
+    float32x4_t F[6][6], FZ[6][4], f[4][4], b;
 
     // Read a 6x6 tile in the Winograd domain
     for (int i = 0, m = 0; i < 6; i++)
@@ -149,11 +154,13 @@
     }
 
     // Write out the output tile
+    b = vld1q_f32(bptr);
+    bptr += 4;
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        vst1q_f32(outptrs[i][j], f[i][j]);
+        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
         outptrs[i][j] += 4;
       }
     }
@@ -163,7 +170,7 @@
   for (; channels_remaining >= 2; channels_remaining -= 2)
   {
     // Matrices used and computed during this transform
-    float32x2_t F[6][6], FZ[6][4], f[4][4];
+    float32x2_t F[6][6], FZ[6][4], f[4][4], b;
 
     // Read a 6x6 tile in the Winograd domain
     for (int i = 0, m = 0; i < 6; i++)
@@ -208,11 +215,13 @@
     }
 
     // Write out the output tile
+    b = vld1_f32(bptr);
+    bptr += 2;
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        vst1_f32(outptrs[i][j], f[i][j]);
+        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
         outptrs[i][j] += 2;
       }
     }
@@ -221,7 +230,7 @@
   for (; channels_remaining; channels_remaining--)
   {
     // Matrices used and computed during this transform
-    float F[6][6], FZ[6][4], f[4][4];
+    float F[6][6], FZ[6][4], f[4][4], b;
 
     // Read a 6x6 tile in the Winograd domain
     for (int i = 0, m = 0; i < 6; i++)
@@ -252,11 +261,12 @@
     }
 
     // Write out the output tile
+    b = *(bptr++);
     for (int i = 0; i < cells_i; i++)
     {
       for (int j = 0; j < cells_j; j++)
       {
-        *(outptrs[i][j]++) = f[i][j];
+        *(outptrs[i][j]++) = f[i][j] + b;
       }
     }
   }

diff --git a/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp
similarity index 96%
rename from src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp
index c0b2824..6c71461 100644
--- a/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_3x3_fp32.cpp

@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 
-#include "arm.hpp"
-#include "winograd_gemm.hpp"
-#include "transforms/kernel.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
 
 namespace winograd
 {

diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
new file mode 100644
index 0000000..76393c1
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp

@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+namespace winograd
+{
+  template <>
+  template <>
+  void WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::execute(
+    const int n_output_channels,  // Number of output channels
+    const int n_input_channels,  // Number of input channels
+    const float* const input,  // Weights, indexed as [5][5][input channel][output channel]
+    float* const output,  // Destination for the 6x6 transformed weights
+    const int matrix_stride,  // Offset (in floats) between consecutive elements of the 6x6 result
+    const int matrix_row_stride  // Offset (in floats) between input channels in the output
+  )
+  {
+    // Get a pointer to each of the 5x5 cells of the weight tensor
+    const auto weight_col_stride = n_input_channels * n_output_channels;
+    const auto weight_row_stride = 5 * weight_col_stride;
+    const float *inptrs[5][5];
+    for (int i = 0; i < 5; i++)
+    {
+      for (int j = 0; j < 5; j++)
+      {
+        inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+      }
+    }
+
+    // For each input channel (inptrs keep advancing across channels, so they are not reset here)
+    for (int ic = 0; ic < n_input_channels; ic++)
+    {
+      float *outptr = output + ic * matrix_row_stride;
+
+      // For each output channel (vector loops first, scalar loop for the remainder)
+      int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+      for (; channels_remaining >= 4; channels_remaining -= 4)
+      {
+        // Matrices used and computed in this kernel (w: weights, Ww: W w, V: W w WT)
+        float32x4_t w[5][5], Ww[6][5], V[6][6];
+
+        // Read weights for four output channels
+        for (int i = 0; i < 5; i++)
+        {
+          for (int j = 0; j < 5; j++)
+          {
+            w[i][j] = vld1q_f32(inptrs[i][j]);
+            inptrs[i][j] += 4;
+          }
+        }
+
+        // Compute the matrix W w
+        for (int j = 0; j < 5; j++)
+        {
+          // Ww[0][j] = w[0][j]/4.0f;
+          Ww[0][j] = vmulq_n_f32(w[0][j], 1.0f/4.0f);
+
+          // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+          Ww[1][j] = vmulq_n_f32(
+            vaddq_f32(
+              vaddq_f32(
+                vaddq_f32(w[1][j], w[0][j]),
+                vaddq_f32(w[3][j], w[2][j])
+              ),
+              w[4][j]
+            ),
+            -1.0f/6.0f
+          );
+
+          // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+          // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+          Ww[2][j] = vmulq_n_f32(
+            vsubq_f32(
+              vaddq_f32(
+                vsubq_f32(w[1][j], w[0][j]),
+                vsubq_f32(w[3][j], w[2][j])
+              ),
+              w[4][j]
+            ),
+            1.0f/6.0f
+          );
+
+          // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+          Ww[3][j] = vmulq_n_f32(
+            vmlaq_n_f32(
+              vaddq_f32(
+                vaddq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+                vaddq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+              ),
+              w[4][j], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+          Ww[4][j] = vmulq_n_f32(
+            vmlaq_n_f32(
+              vaddq_f32(
+                vsubq_f32(vmulq_n_f32(w[0][j], 1.0f/8.0f), vmulq_n_f32(w[1][j], 1.0f/4.0f)),
+                vsubq_f32(vmulq_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+              ),
+              w[4][j], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // Ww[5][j] = w[4][j];
+          Ww[5][j] = w[4][j];
+        }
+
+        // Compute V = W w WT
+        for (int i = 0; i < 6; i++)
+        {
+          // V[i][0] = Ww[i][0]/4.0f;
+          V[i][0] = vmulq_n_f32(Ww[i][0], 1.0f/4.0f);
+
+          // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+          V[i][1] = vmulq_n_f32(
+            vaddq_f32(
+              vaddq_f32(
+                vaddq_f32(Ww[i][1], Ww[i][0]),
+                vaddq_f32(Ww[i][3], Ww[i][2])
+              ),
+              Ww[i][4]
+            ),
+            -1.0f/6.0f
+          );
+
+          // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+          // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+          V[i][2] = vmulq_n_f32(
+            vsubq_f32(
+              vaddq_f32(
+                vsubq_f32(Ww[i][1], Ww[i][0]),
+                vsubq_f32(Ww[i][3], Ww[i][2])
+              ),
+              Ww[i][4]
+            ),
+            1.0f/6.0f
+          );
+
+          // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+          V[i][3] = vmulq_n_f32(
+            vmlaq_n_f32(
+              vaddq_f32(
+                vaddq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+                vaddq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+              ),
+              Ww[i][4], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+          V[i][4] = vmulq_n_f32(
+            vmlaq_n_f32(
+              vaddq_f32(
+                vsubq_f32(vmulq_n_f32(Ww[i][0], 1.0f/8.0f), vmulq_n_f32(Ww[i][1], 1.0f/4.0f)),
+                vsubq_f32(vmulq_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+              ),
+              Ww[i][4], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // V[i][5] = Ww[i][4];
+          V[i][5] = Ww[i][4];
+        }
+
+        // Store the transformed weights, one 6x6 element per matrix_stride
+        for (int i = 0, m = 0; i < 6; i++)
+        {
+          for (int j = 0; j < 6; j++, m++)
+          {
+            vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+          }
+        }
+        outptr += 4;  // Advance to the next four output channels
+      }
+#endif  // __aarch64__
+#ifdef __arm_any__
+      for (; channels_remaining >= 2; channels_remaining -= 2)
+      {
+        // Matrices used and computed in this kernel (w: weights, Ww: W w, V: W w WT)
+        float32x2_t w[5][5], Ww[6][5], V[6][6];
+
+        // Read weights for two output channels
+        for (int i = 0; i < 5; i++)
+        {
+          for (int j = 0; j < 5; j++)
+          {
+            w[i][j] = vld1_f32(inptrs[i][j]);
+            inptrs[i][j] += 2;
+          }
+        }
+
+        // Compute the matrix W w
+        for (int j = 0; j < 5; j++)
+        {
+          // Ww[0][j] = w[0][j]/4.0f;
+          Ww[0][j] = vmul_n_f32(w[0][j], 1.0f/4.0f);
+
+          // Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+          Ww[1][j] = vmul_n_f32(
+            vadd_f32(
+              vadd_f32(
+                vadd_f32(w[1][j], w[0][j]),
+                vadd_f32(w[3][j], w[2][j])
+              ),
+              w[4][j]
+            ),
+            -1.0f/6.0f
+          );
+
+          // Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+          // Ww[2][j] = ((w[1][j] - w[0][j]) + (w[3][j] - w[2][j]) - w[4][j])/6.0f;
+          Ww[2][j] = vmul_n_f32(
+            vsub_f32(
+              vadd_f32(
+                vsub_f32(w[1][j], w[0][j]),
+                vsub_f32(w[3][j], w[2][j])
+              ),
+              w[4][j]
+            ),
+            1.0f/6.0f
+          );
+
+          // Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+          Ww[3][j] = vmul_n_f32(
+            vmla_n_f32(
+              vadd_f32(
+                vadd_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+                vadd_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+              ),
+              w[4][j], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+          Ww[4][j] = vmul_n_f32(
+            vmla_n_f32(
+              vadd_f32(
+                vsub_f32(vmul_n_f32(w[0][j], 1.0f/8.0f), vmul_n_f32(w[1][j], 1.0f/4.0f)),
+                vsub_f32(vmul_n_f32(w[2][j], 1.0f/2.0f), w[3][j])
+              ),
+              w[4][j], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // Ww[5][j] = w[4][j];
+          Ww[5][j] = w[4][j];
+        }
+
+        // Compute V = W w WT
+        for (int i = 0; i < 6; i++)
+        {
+          // V[i][0] = Ww[i][0]/4.0f;
+          V[i][0] = vmul_n_f32(Ww[i][0], 1.0f/4.0f);
+
+          // V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+          V[i][1] = vmul_n_f32(
+            vadd_f32(
+              vadd_f32(
+                vadd_f32(Ww[i][1], Ww[i][0]),
+                vadd_f32(Ww[i][3], Ww[i][2])
+              ),
+              Ww[i][4]
+            ),
+            -1.0f/6.0f
+          );
+
+          // V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+          // V[i][2] = ((Ww[i][1] - Ww[i][0]) + (Ww[i][3] - Ww[i][2]) - Ww[i][4])/6.0f;
+          V[i][2] = vmul_n_f32(
+            vsub_f32(
+              vadd_f32(
+                vsub_f32(Ww[i][1], Ww[i][0]),
+                vsub_f32(Ww[i][3], Ww[i][2])
+              ),
+              Ww[i][4]
+            ),
+            1.0f/6.0f
+          );
+
+          // V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+          V[i][3] = vmul_n_f32(
+            vmla_n_f32(
+              vadd_f32(
+                vadd_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+                vadd_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+              ),
+              Ww[i][4], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+          V[i][4] = vmul_n_f32(
+            vmla_n_f32(
+              vadd_f32(
+                vsub_f32(vmul_n_f32(Ww[i][0], 1.0f/8.0f), vmul_n_f32(Ww[i][1], 1.0f/4.0f)),
+                vsub_f32(vmul_n_f32(Ww[i][2], 1.0f/2.0f), Ww[i][3])
+              ),
+              Ww[i][4], 2.0f
+            ),
+            1.0f/3.0f
+          );
+
+          // V[i][5] = Ww[i][4];
+          V[i][5] = Ww[i][4];
+        }
+
+        // Store the transformed weights, one 6x6 element per matrix_stride
+        for (int i = 0, m = 0; i < 6; i++)
+        {
+          for (int j = 0; j < 6; j++, m++)
+          {
+            vst1_f32(outptr + m*matrix_stride, V[i][j]);
+          }
+        }
+        outptr += 2;  // Advance to the next two output channels
+      }
+#endif  // __arm_any__
+      for (; channels_remaining; channels_remaining--)
+      {
+        // Matrices used and computed in this kernel (w: weights, Ww: W w, V: W w WT)
+        float w[5][5], Ww[6][5], V[6][6];
+
+        // Read weights for a single output channel
+        for (int i = 0; i < 5; i++)
+        {
+          for (int j = 0; j < 5; j++)
+          {
+            w[i][j] = *(inptrs[i][j]++);
+          }
+        }
+
+        // Compute the matrix W w
+        for (int j = 0; j < 5; j++)
+        {
+          Ww[0][j] = w[0][j]/4.0f;
+          Ww[1][j] = -( w[0][j] + w[1][j] + w[2][j] + w[3][j] + w[4][j])/6.0f;
+          Ww[2][j] = +(-w[0][j] + w[1][j] - w[2][j] + w[3][j] - w[4][j])/6.0f;
+          Ww[3][j] = (w[0][j]/8.0f + w[1][j]/4.0f + w[2][j]/2.0f + w[3][j] + 2*w[4][j])/3.0f;
+          Ww[4][j] = (w[0][j]/8.0f - w[1][j]/4.0f + w[2][j]/2.0f - w[3][j] + 2*w[4][j])/3.0f;
+          Ww[5][j] = w[4][j];
+        }
+
+        // Compute V = W w WT
+        for (int i = 0; i < 6; i++)
+        {
+          V[i][0] = Ww[i][0]/4.0f;
+          V[i][1] = -( Ww[i][0] + Ww[i][1] + Ww[i][2] + Ww[i][3] + Ww[i][4])/6.0f;
+          V[i][2] = +(-Ww[i][0] + Ww[i][1] - Ww[i][2] + Ww[i][3] - Ww[i][4])/6.0f;
+          V[i][3] = (Ww[i][0]/8.0f + Ww[i][1]/4.0f + Ww[i][2]/2.0f + Ww[i][3] + 2*Ww[i][4])/3.0f;
+          V[i][4] = (Ww[i][0]/8.0f - Ww[i][1]/4.0f + Ww[i][2]/2.0f - Ww[i][3] + 2*Ww[i][4])/3.0f;
+          V[i][5] = Ww[i][4];
+        }
+
+        // Store the transformed weights, one 6x6 element per matrix_stride
+        for (int i = 0, m = 0; i < 6; i++)
+        {
+          for (int j = 0; j < 6; j++, m++)
+          {
+            *(outptr + m*matrix_stride) = V[i][j];
+          }
+        }
+        outptr++;  // Advance to the next output channel
+      }
+    }
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)  // NOTE: shape is not consulted
+  {
+    return 0;  // Always reports zero operations. NOTE(review): presumably a placeholder cost model - confirm
+  }
+
+  template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>;  // Explicit instantiation of the fp32 weights transform
+}  // namespace winograd

diff --git a/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp
similarity index 97%
rename from src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp
rename to src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp
index de659c3..a56a475 100644
--- a/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4x4_3x3_fp32.cpp

@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 
-#include "arm.hpp"
-#include "winograd_gemm.hpp"
-#include "transforms/kernel.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
 
 namespace winograd
 {

diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
similarity index 96%
rename from src/core/NEON/kernels/winograd/winograd_gemm.cpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
index b44a453..c082356 100644
--- a/src/core/NEON/kernels/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp

@@ -21,8 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "winograd_gemm.hpp"
-#include "batched_blocked_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
+
 using namespace winograd;
 
 /** Get the output shape of a convolution. */
@@ -34,11 +35,10 @@
   const PaddingType padding
 )
 {
-  // TODO Accept different kernel sizes
   return Tensor4DShape {
     in_shape.n_batches,
-    (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - 2,
-    (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - 2,
+  (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
+  (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
     kernel_shape.n_output_channels,
     in_shape.ordering
   };
@@ -372,6 +372,7 @@
 Convolution<TOut, TIn>::execute(
   TOut* const output,
   const TIn* const input,
+  const TOut* const biases,
   void *working_space,
   const int n_threads
 )
@@ -408,9 +409,6 @@
 
   // If we need to re-order the input and output tensors then the final chunk
   // of the working space can be used for this purpose.
-  // TODO  - Overlay the input reorder on top of the output matrices
-  //       - Overlay the output reorder on top of the input matrices
-  // Reorder the input input form if it was not provided in this ordering.
   const TIn* input_nhwc = input;
   if (input_shape.ordering == NCHW)
   {
@@ -479,7 +477,11 @@
     kernel_matrices[0],
     output_matrices[0]
   );
-  gemms.run(0, gemms.get_window());
+  for (unsigned int i = 0; i < gemms.get_window(); i++)
+  {
+    auto run_gemm = [&] () { gemms.run(i, i+1); };
+    prof("GEMM", run_gemm, 0, 0, 0);
+  }
 
   // If the output tensor needs to be in NCHW form then store the NHWC output
   // tensor in temporary storage and then reorder. If the output tensor needs
@@ -498,6 +500,7 @@
     output_matrices[0],
     out_matrix_stride_bytes / sizeof(TOut),
     out_matrix_row_stride,
+    biases,
     output_nhwc,
     output_shape.n_batches,
     output_shape.n_rows,
@@ -548,13 +551,16 @@
 Convolution<TOut, TIn>::execute(
   TOut* const output,
   const TIn* const input,
+  const TOut* const biases,
   const int n_threads
 )
 {
-  execute(output, input, NULL, n_threads);
+  execute(output, input, biases, NULL, n_threads);
 }
 
 
 // Instantiate required implementations
 template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
 template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
+
+template class WinogradGEMM<2, 2, 5, 5>::Convolution<float, float>;

diff --git a/src/core/NEON/kernels/winograd/winograd_layer.cpp b/src/core/NEON/kernels/winograd/winograd_layer.cpp
deleted file mode 100644
index 689ecba..0000000
--- a/src/core/NEON/kernels/winograd/winograd_layer.cpp
+++ /dev/null

@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "convolution.hpp"
-#include "winograd_layer.hpp"
-#include "tensor.hpp"
-
-
-/** Determine how much memory (in units of TIn) to allocate for the transformed
- * weights.
- */
-template <
-  int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
-  typename TIn, typename TOut
->
-unsigned int WinogradConvolutionLayer<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
->::get_weight_storage_size(
-  const int n_output_channels,  /** Number of output feature maps. */
-  const int n_input_channels    /** Number of input feature maps. */
-)
-{
-  const KernelShape shape(
-    n_output_channels, KernelRows, KernelCols, n_input_channels
-  );
-  return static_cast<unsigned int>(
-    // WinogradConv returns the size in bytes, we divide by `sizeof(TIn)` to
-    // express that in units of TIn.
-    WinogradConv::get_kernel_storage_size(shape) / sizeof(TIn)
-  );
-}
-
-
-/** Determine how much memory (in units of TIn) to allocate for the transformed
- * input.
- */
-template <
-  int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
-  typename TIn, typename TOut
->
-unsigned int WinogradConvolutionLayer<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
->::get_input_storage_size(
-  const int n_batches,     /** Number of batches in the input tensor. */
-  const int n_channels,    /** Number of feature maps in the input tensor. */
-  const int n_rows,        /** Number of rows in each feature map. */
-  const int n_cols,        /** Number of columns in each feature map. */
-  const bool same_padding  /** Use "SAME" padding, otherwise use "VALID". */
-)
-{
-  // Construct shapes for the input and kernel tensors.
-  const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
-  const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels);
-  const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
-
-  // Return the size, converted into units of TIn
-  return static_cast<unsigned int>(
-    WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) /
-    sizeof(TIn)
-  );
-}
-
-
-/** Determine how much memory (in units of TOut) to allocate for the (Winograd
- * domain) output.
- */
-template <
-  int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
-  typename TIn, typename TOut
->
-unsigned int WinogradConvolutionLayer<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
->::get_output_storage_size(
-  const int n_batches,          /** Number of batches in the output tensor. */
-  const int n_rows,             /** Number of rows in each feature map of the input tensor. */
-  const int n_cols,             /** Number of columns in each feature map of the input tensor. */
-  const int n_output_channels,  /** Number of feature maps in the output tensor. */
-  const bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
-)
-{
-  // Construct shapes for the input and kernel tensors.
-  const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
-  const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1);
-  const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
-
-  // Return the size, converted into units of TOut
-  return static_cast<unsigned int>(
-    WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) /
-    sizeof(TOut)
-  );
-}
-
-
-/** Get the shape (rows, cols) of a feature map of the output tensor. */
-template <
-  int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
-  typename TIn, typename TOut
->
-std::pair<int, int> WinogradConvolutionLayer<
-  OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
->::get_output_feature_map_shape(
-  const int n_input_rows,  /** Number of rows in the input feature map. */
-  const int n_input_cols,  /** Number of columns in the input feature map. */
-  const bool same_padding  /** Use "SAME" padding, otherwise use "VALID". */
-)
-{
-  // Construct shapes for the input and kernel tensors.
-  const Tensor4DShape input_shape(1, n_input_rows, n_input_cols, 1);
-  const KernelShape kern_shape(1, KernelRows, KernelCols, 1);
-  const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
-
-  // Compute the new shape
-  const auto output_shape = WinogradConv::get_output_shape(
-    kern_shape, input_shape, padding
-  );
-
-  return std::make_pair(output_shape.n_rows, output_shape.n_cols);
-}
-
-
-/** Create a new Winograd convolution layer.
- */
-template <
-  int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
-  typename TIn, typename TOut
->
-WinogradConvolutionLayer<OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut>::
-WinogradConvolutionLayer(
-  const int n_batches,          /** Number of batches in the input and output tensors. */
-  const int n_input_channels,   /** Number of feature maps in a batch of the input tensor. */
-  const int n_input_rows,       /** Number of rows in a feature map of the input tensor. */
-  const int n_input_cols,       /** Number of columns in a feature map of the input tensor. */
-  const int n_output_channels,  /** Number of feature maps in the output tensor. */
-  const bool same_padding,      /** Use "SAME" padding, otherwise use "VALID". */
-  const TIn* const weights,     /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps. */
-  TIn* const winograd_weights,  /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
-  const TIn* const input,       /** Pointer to NHWC ordered input tensor, in the spatial domain. */
-  TIn* const winograd_input,    /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
-  TOut* const output,           /** Pointer to NHWC ordered output tensor, in the spatial domain. */
-  TOut* const winograd_output   /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
-) : _kernel_shape(n_output_channels, KernelRows, KernelCols, n_input_channels),
-    _input_shape(n_batches, n_input_rows, n_input_cols, n_input_channels),
-    _padding(same_padding ? PADDING_SAME : PADDING_VALID),
-    _output_shape(WinogradConv::get_output_shape(_kernel_shape, _input_shape, _padding)),
-    _n_output_rows(_output_shape.n_rows),
-    _n_output_cols(_output_shape.n_cols),
-    _kernel_matrix_stride(WinogradConv::get_kernel_matrix_stride(_kernel_shape)),
-    _kernel_matrix_row_stride(roundup(n_output_channels, WinogradConv::N_BLOCK)),
-    _input_matrix_stride(WinogradConv::get_input_matrix_stride(_kernel_shape, _input_shape, _padding)),
-    _input_matrix_row_stride(n_input_channels),
-    _output_matrix_stride(WinogradConv::get_output_matrix_stride(_kernel_shape, _input_shape, _padding)),
-    _output_matrix_row_stride(_kernel_matrix_row_stride),
-    _tile_rows(iceildiv(_n_output_rows, OutputTileRows)),
-    _tile_cols(iceildiv(_n_output_cols, OutputTileCols)),
-    _m(n_batches * _tile_rows * _tile_cols),
-    _k(n_input_channels),
-    _n(n_output_channels),
-    weights_transform(
-      weights, winograd_weights,
-      _kernel_matrix_stride, _kernel_matrix_row_stride,
-      n_output_channels, n_input_channels
-    ),
-    input_transform(
-      input, n_batches, n_input_rows, n_input_cols, n_input_channels, _padding,
-      winograd_input, _input_matrix_stride, _input_matrix_row_stride
-    ),
-    gemms(
-      WinogradBase::N_GEMMS, _m, _k, _n,
-      _input_matrix_stride, _input_matrix_row_stride,
-      _kernel_matrix_stride, _kernel_matrix_row_stride,
-      _output_matrix_stride, _output_matrix_row_stride,
-      winograd_input, winograd_weights, winograd_output
-    ),
-    output_transform(
-      winograd_output, _output_matrix_stride, _output_matrix_row_stride,
-      output, n_batches, _n_output_rows, _n_output_cols, n_output_channels
-    )
-{
-}
-
-// Instantiate valid implementations.
-template class WinogradConvolutionLayer<2, 2, 3, 3, float, float>;
-template class WinogradConvolutionLayer<4, 4, 3, 3, float, float>;

diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 0150a95..836c379 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp

@@ -92,7 +92,7 @@
     return clone_obj;
 }
 
-ITensorInfo &SubTensorInfo::set_tensor_shape(TensorShape shape)
+ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape)
 {
     ARM_COMPUTE_ERROR_ON(_parent == nullptr);
 

diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 24988e2..bd0c85f 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp

@@ -348,7 +348,7 @@
     return *this;
 }
 
-ITensorInfo &TensorInfo::set_tensor_shape(TensorShape shape)
+ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape)
 {
     _tensor_shape                  = shape;
     _offset_first_element_in_bytes = 0;
@@ -378,7 +378,7 @@
     return *this;
 }
 
-ITensorInfo &TensorInfo::set_quantization_info(QuantizationInfo quantization_info)
+ITensorInfo &TensorInfo::set_quantization_info(const QuantizationInfo &quantization_info)
 {
     _quantization_info = quantization_info;
     return *this;

diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 83a843d..f4b4553 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp

@@ -250,6 +250,21 @@
     return res;
 }
 
+PadStrideInfo arm_compute::calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+{
+    const auto &strides         = conv_info.stride();
+    const int   out_width       = std::ceil(float(input_shape.x()) / float(strides.first));
+    const int   out_height      = std::ceil(float(input_shape.y()) / float(strides.second));
+    const int   pad_width       = ((out_width - 1) * strides.first + weights_shape.x() - input_shape.x());
+    const int   pad_height      = ((out_height - 1) * strides.second + weights_shape.y() - input_shape.y());
+    const int   same_pad_left   = pad_width / 2;
+    const int   same_pad_top    = pad_height / 2;
+    const int   same_pad_right  = pad_width - same_pad_left;
+    const int   same_pad_bottom = pad_height - same_pad_top;
+
+    return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
+}
+
 TensorShape arm_compute::deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights)
 {
     TensorShape out_shape(input);

diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index f495e48..f5f9f1f 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -82,7 +82,7 @@
 {
     for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((win[i].start() != 0) || (win[i].end() != win[i].step()),
                                             function, file, line,
                                             "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
     }

diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index ac5316f..b6c6822 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,6 +62,7 @@
     std::unique_ptr<INode>                      _current_node{ nullptr };
     ITensorObject                              *_current_output{ nullptr };
     bool                                        _info_enabled{ false };
+    CLTuner                                     _tuner{};
 
 private:
     ITensorObject *_current_input{ nullptr };
@@ -76,10 +77,22 @@
 Graph::Graph()
     : _pimpl{ new Private() }
 {
+    graph_init();
+}
+
+void Graph::graph_init(const bool use_cl_tuner)
+{
     // Check if OpenCL is available and initialize the scheduler
     if(opencl_is_available())
     {
-        arm_compute::CLScheduler::get().default_init();
+        if(use_cl_tuner)
+        {
+            arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
+        }
+        else
+        {
+            arm_compute::CLScheduler::get().default_init();
+        }
     }
 }
 
@@ -119,6 +132,11 @@
         _previous_hints = _current_hints; // For the first node just assume the previous node was of the same type as this one
     }
 
+    if(_current_node->supports_in_place())
+    {
+        _current_output = _current_input;
+    }
+
     //Automatic output configuration ?
     if(_current_output == nullptr)
     {
@@ -140,8 +158,12 @@
     _ctx.hints()                                 = _current_hints;
     std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input, _current_output);
 
-    // Allocate current input
-    _current_input->allocate();
+    // If the operation is done in-place, do not allocate or it will prevent following layers from performing the configuration
+    if(!_current_node->supports_in_place())
+    {
+        // Allocate current input
+        _current_input->allocate();
+    }
 
     // Map input if needed
     if(_current_input->target() == TargetHint::OPENCL)
@@ -215,11 +237,25 @@
         _pimpl->_graph_output->allocate();
     }
 }
+
 bool Graph::opencl_is_available()
 {
     return arm_compute::opencl_is_available();
 }
 
+arm_compute::GPUTarget Graph::gpu_target()
+{
+    // Check if OpenCL is available before returning the GPU target
+    if(opencl_is_available())
+    {
+        return arm_compute::CLScheduler::get().target();
+    }
+    else
+    {
+        return GPUTarget::MIDGARD;
+    }
+}
+
 void Graph::set_temp(TensorInfo &&tmp)
 {
     ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);

diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 582f936..c753f66 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,6 +39,14 @@
     ARM_COMPUTE_ERROR_ON(target_hint == TargetHint::OPENCL && !opencl_is_available());
     return target_hint;
 }
+bool INode::supports_in_place() const
+{
+    return _supports_in_place;
+}
+void INode::set_supports_in_place(bool value)
+{
+    _supports_in_place = value;
+}
 GraphHints INode::node_override_hints(GraphHints hints) const
 {
     TargetHint target_hint = hints.target_hint();

diff --git a/src/graph/SubGraph.cpp b/src/graph/SubGraph.cpp
index 8ba2af6..4065e1d 100644
--- a/src/graph/SubGraph.cpp
+++ b/src/graph/SubGraph.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,6 +66,10 @@
     }
     graph->add_tensor_object(std::move(_input));
 
+    // Make sure first and last nodes of the subgraph always do operations out-of-place
+    _nodes.front()->set_supports_in_place(false);
+    _nodes.back()->set_supports_in_place(false);
+
     // Construct nodes
     for(auto &node : _nodes)
     {

diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
index 54f30ef..546c42a 100644
--- a/src/graph/nodes/ActivationLayer.cpp
+++ b/src/graph/nodes/ActivationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,7 @@
 ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
     : _activation_info(activation_info)
 {
+    set_supports_in_place(true);
 }
 
 std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)

diff --git a/src/graph/nodes/BatchNormalizationLayer.cpp b/src/graph/nodes/BatchNormalizationLayer.cpp
index 7851aa5..24287ac 100644
--- a/src/graph/nodes/BatchNormalizationLayer.cpp
+++ b/src/graph/nodes/BatchNormalizationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,7 @@
     node_ctx.add_input(_gamma.tensor());
     node_ctx.add_output(out);
     node_ctx.add_parameter<float>("epsilon", _epsilon);
+    node_ctx.add_parameter<ActivationLayerInfo>("act_info", _act_info);
 
     // Configure operation
     auto func = OperationRegistry::get().find_operation(OperationType::BatchNormalizationLayer, _target_hint)->configure(node_ctx);

diff --git a/src/graph/nodes/DepthwiseConvolutionLayer.cpp b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
index 1209d03..e5101cc 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayer.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayer.cpp

@@ -40,10 +40,8 @@
 
     if(_weights.tensor() == nullptr)
     {
-        TensorShape shape = in->info()->tensor_shape();
-        shape.set(Window::DimX, _conv_width);
-        shape.set(Window::DimY, _conv_height);
-        TensorInfo info = TensorInfo(TensorShape(shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
+        TensorShape weights_shape(_conv_width, _conv_height, input->tensor()->info()->tensor_shape().z());
+        TensorInfo  info = TensorInfo(TensorShape(weights_shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
         info.set_quantization_info(_quant_info);
         _weights.set_info(std::move(info));
     }

diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 219e0f9..3742150 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *

diff --git a/src/graph/nodes/ResidualLayer.cpp b/src/graph/nodes/ResidualLayer.cpp
new file mode 100644
index 0000000..87404f9
--- /dev/null
+++ b/src/graph/nodes/ResidualLayer.cpp

@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ResidualLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+#include "arm_compute/graph/SubGraph.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "support/ToolchainSupport.h"
+#include "utils/Utils.h"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+
+using namespace arm_compute::graph;
+
+/** Residual function */
+class ResidualFunction final : public arm_compute::IFunction
+{
+public:
+    /** Constructor taking the graph context and the output tensor object */
+    ResidualFunction(GraphContext &ctx, ITensorObject *output)
+        : _ctx(ctx), _input(nullptr), _output(output), _func(nullptr), _graphs(), _graph_outputs()
+    {
+    }
+
+    /** Prevent instances from being copy constructed */
+    ResidualFunction(const ResidualFunction &) = delete;
+    /** Prevent instances from being copy assigned */
+    const ResidualFunction &operator=(const ResidualFunction &) = delete;
+    /** Prevent instances from being move constructed */
+    ResidualFunction(ResidualFunction &&) = delete;
+    /** Prevent instances from being move assigned */
+    ResidualFunction &operator=(ResidualFunction &&) = delete;
+    /** Default destructor */
+    ~ResidualFunction() override = default;
+
+    /** Set the input (when using only one sub graph)
+     *
+     * @param[in] input Input to set
+     */
+    void set_input(std::unique_ptr<ITensorObject> input)
+    {
+        _input = std::move(input);
+    }
+
+    /** Registers graph to be executed by the residual function
+     *
+     * @param[in] graph  Graph to register
+     * @param[in] output Output to register
+     */
+    void register_graph(std::unique_ptr<Graph> graph, std::unique_ptr<ITensorObject> output)
+    {
+        _graphs.push_back(std::move(graph));
+        _graph_outputs.push_back(std::move(output));
+    }
+
+    /** Configure the function */
+    void configure()
+    {
+        ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
+        TargetHint target_hint = _ctx.hints().target_hint();
+
+        // Create node context
+        NodeContext node_ctx(OperationType::ArithmeticAddition);
+        node_ctx.set_target(target_hint);
+
+        if(_graphs.size() == 1)
+        {
+            arm_compute::ITensor *in = _input->tensor();
+            node_ctx.add_input(in);
+        }
+
+        for(auto &o : _graph_outputs)
+        {
+            arm_compute::ITensor *in = o->tensor();
+            node_ctx.add_input(in);
+        }
+
+        arm_compute::ITensor *out = _output->tensor();
+        auto_init_if_empty(*out->info(), *_graph_outputs[0]->tensor()->info());
+        node_ctx.add_output(out);
+
+        _func = OperationRegistry::get().find_operation(OperationType::ArithmeticAddition, target_hint)->configure(node_ctx);
+
+        for(auto &o : _graph_outputs)
+        {
+            o->allocate();
+        }
+    }
+
+    // Inherited methods overridden:
+    void run() override
+    {
+        ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
+
+        for(auto &g : _graphs)
+        {
+            ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
+            g->run();
+        }
+
+        _func->run();
+    }
+
+private:
+    GraphContext                                _ctx;
+    std::unique_ptr<ITensorObject>              _input;
+    ITensorObject                              *_output;
+    std::unique_ptr<arm_compute::IFunction>     _func;
+    std::vector<std::unique_ptr<Graph>>         _graphs;
+    std::vector<std::unique_ptr<ITensorObject>> _graph_outputs;
+};
+
+std::unique_ptr<arm_compute::IFunction> ResidualLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
+
+    // Create residual function
+    auto func = arm_compute::support::cpp14::make_unique<ResidualFunction>(ctx, output);
+
+    if(_sub_graphs.size() == 1)
+    {
+        std::unique_ptr<ITensorObject> original_in;
+        original_in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
+                                                                          input->tensor()->info()->tensor_shape(),
+                                                                          Coordinates());
+        func->set_input(std::move(original_in));
+    }
+
+    // Construct all sub-graphs given the input/output
+    for(auto &sg : _sub_graphs)
+    {
+        ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
+
+        // IO buffers
+        std::unique_ptr<ITensorObject> in;
+        std::unique_ptr<ITensorObject> out;
+        std::unique_ptr<ITensorObject> func_in;
+
+        // Create input sub-tensor
+        if(!sg->has_input())
+        {
+            in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
+                                                                     input->tensor()->info()->tensor_shape(),
+                                                                     Coordinates());
+        }
+
+        // Create output sub-tensor
+        if(!sg->has_output())
+        {
+            ITensorInfo *info = input->tensor()->info();
+            func_in           = arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo(info->num_channels(), info->data_type(), info->fixed_point_position()));
+            func_in->set_target(ctx.hints().target_hint());
+            out = arm_compute::support::cpp14::make_unique<SubTensor>(func_in->tensor(),
+                                                                      TensorShape(),
+                                                                      Coordinates(0, 0, 0),
+                                                                      func_in->target(),
+                                                                      true);
+        }
+
+        // Construct sub_graph
+        auto g = sg->construct(ctx, std::move(in), std::move(out));
+
+        // Register graph to function
+        func->register_graph(std::move(g), std::move(func_in));
+    }
+
+    func->configure();
+
+    return std::move(func);
+}

diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
index 61315e7..fe56122 100644
--- a/src/graph/operations/CLSimpleOperations.cpp
+++ b/src/graph/operations/CLSimpleOperations.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,6 +66,34 @@
     return std::move(activation);
 }
 
+/* Arithmetic addition */
+REGISTER_SIMPLE_OPERATION(CLArithmeticAdditionOperation, OPENCL, OperationType::ArithmeticAddition)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in1 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *in2 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    auto addition = arm_compute::support::cpp14::make_unique<arm_compute::CLArithmeticAddition>();
+    addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLArithmeticAddition"
+                               << " Data Type: " << in1->info()->data_type()
+                               << " Input 1 shape: " << in1->info()->tensor_shape()
+                               << " Input 2 shape: " << in2->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(addition);
+}
+
 /* Batch Normalization Layer */
 REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationType::BatchNormalizationLayer)
 {
@@ -79,17 +107,18 @@
     ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
 
     // Extract IO and info
-    auto      *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *mean    = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
-    auto      *var     = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
-    auto      *beta    = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3));
-    auto      *gamma   = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4));
-    auto      *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto epsilon = ctx.parameter<float>("epsilon");
+    auto      *in       = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *mean     = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
+    auto      *var      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
+    auto      *beta     = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3));
+    auto      *gamma    = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4));
+    auto      *out      = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto epsilon  = ctx.parameter<float>("epsilon");
+    const auto act_info = ctx.parameter<ActivationLayerInfo>("act_info");
 
     // Create and configure function
     auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLBatchNormalizationLayer>();
-    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon);
+    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon, act_info);
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLBatchNormalizationLayer"
@@ -101,6 +130,9 @@
                                << " Beta shape: " << beta->info()->tensor_shape()
                                << " Gamma shape: " << gamma->info()->tensor_shape()
                                << " Epsilon: " << epsilon
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
                                << std::endl);
 
     return std::move(batch_norm);
@@ -460,4 +492,4 @@
                                << std::endl);
 
     return std::move(smx);
-}
\ No newline at end of file
+}

diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
index 49adbe9..4154b9a 100644
--- a/src/graph/operations/NESimpleOperations.cpp
+++ b/src/graph/operations/NESimpleOperations.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,6 +66,34 @@
     return std::move(activation);
 }
 
+/* Arithmetic addition: builds a configured NEArithmeticAddition from an operation context with two inputs and one output */
+REGISTER_SIMPLE_OPERATION(NEArithmeticAdditionOperation, NEON, OperationType::ArithmeticAddition)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2); // exactly two operands are expected
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1); // and exactly one result
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr); // every IO object must be an ITensor
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info (same casts as checked above; in1/in2 are the operands, out receives the sum)
+    auto *in1 = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *in2 = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    auto addition = arm_compute::support::cpp14::make_unique<arm_compute::NEArithmeticAddition>();
+    addition->configure(in1, in2, out, ConvertPolicy::SATURATE); // NOTE(review): the policy is hard-coded to SATURATE; no context parameter is read for it
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEArithmeticAddition"
+                               << " Data Type: " << in1->info()->data_type()
+                               << " Input 1 shape: " << in1->info()->tensor_shape()
+                               << " Input 2 shape: " << in2->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(addition); // return the configured function object
+}
+
 /* Batch Normalization Layer */
 REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationType::BatchNormalizationLayer)
 {
@@ -79,17 +107,18 @@
     ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
 
     // Extract IO and info
-    auto      *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *mean    = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
-    auto      *var     = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
-    auto      *beta    = dynamic_cast<arm_compute::ITensor *>(ctx.input(3));
-    auto      *gamma   = dynamic_cast<arm_compute::ITensor *>(ctx.input(4));
-    auto      *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto epsilon = ctx.parameter<float>("epsilon");
+    auto      *in       = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *mean     = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
+    auto      *var      = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
+    auto      *beta     = dynamic_cast<arm_compute::ITensor *>(ctx.input(3));
+    auto      *gamma    = dynamic_cast<arm_compute::ITensor *>(ctx.input(4));
+    auto      *out      = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto epsilon  = ctx.parameter<float>("epsilon");
+    const auto act_info = ctx.parameter<ActivationLayerInfo>("act_info");
 
     // Create and configure function
     auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEBatchNormalizationLayer>();
-    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon);
+    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon, act_info);
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEBatchNormalizationLayer"
@@ -101,6 +130,9 @@
                                << " Beta shape: " << beta->info()->tensor_shape()
                                << " Gamma shape: " << gamma->info()->tensor_shape()
                                << " Epsilon: " << epsilon
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
                                << std::endl);
 
     return std::move(batch_norm);
@@ -149,12 +181,23 @@
     auto      *biases    = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) : nullptr;
     auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
     const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
+    const auto opt3x3    = ctx.parameter<bool>("Optimized3x3");
 
     // Create and configure function
     std::unique_ptr<arm_compute::IFunction> func;
-    auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
-    depwthwise_conv->configure(in, weights, biases, out, conv_info);
-    func = std::move(depwthwise_conv);
+    bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
+    if(run_3x3_opt)
+    {
+        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
+        depwthwise_conv->configure(in, weights, biases, out, conv_info);
+        func = std::move(depwthwise_conv);
+    }
+    else
+    {
+        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
+        depwthwise_conv->configure(in, weights, biases, out, conv_info);
+        func = std::move(depwthwise_conv);
+    }
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthwiseConvolutionLayer"

diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
index 63059cb..92254f3 100644
--- a/src/runtime/CL/CLMultiImage.cpp
+++ b/src/runtime/CL/CLMultiImage.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 using namespace arm_compute;
@@ -51,7 +52,8 @@
 
 void CLMultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
 {
-    TensorInfo info(width, height, Format::U8);
+    TensorShape shape = adjust_odd_shape(TensorShape{ width, height }, format);
+    TensorInfo  info(shape, Format::U8);
 
     if(auto_padding)
     {
@@ -72,7 +74,7 @@
         case Format::YUYV422:
         case Format::UYVY422:
         {
-            TensorInfo info_full(width, height, format);
+            TensorInfo info_full(shape, format);
 
             if(auto_padding)
             {
@@ -85,7 +87,8 @@
         case Format::NV12:
         case Format::NV21:
         {
-            TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+            const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88);
+            TensorInfo        info_uv88(shape_uv88, Format::UV88);
 
             if(auto_padding)
             {
@@ -98,7 +101,8 @@
         }
         case Format::IYUV:
         {
-            TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+            const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV);
+            TensorInfo        info_sub2(shape_sub2, Format::U8);
 
             if(auto_padding)
             {
@@ -120,7 +124,7 @@
             break;
     }
 
-    _info.init(width, height, format);
+    _info.init(shape.x(), shape.y(), format);
 }
 
 void CLMultiImage::allocate()

diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 7f5be86..cf5b5bc 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,21 +24,47 @@
 #include "arm_compute/runtime/CL/CLTuner.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include <chrono>
 #include <limits>
 #include <string>
 
 using namespace arm_compute;
 
 CLTuner::CLTuner()
-    : _lws_table()
+    : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event() // real_function and both queues are filled in by the first tune_kernel() call
 {
 }
 
+void CLTuner::set_cl_kernel_event(cl_event kernel_event)
+{
+    _kernel_event = kernel_event; // remembered so find_optimal_lws() can read the profiling timestamps of the last intercepted enqueue
+}
+
 void CLTuner::tune_kernel(ICLKernel &kernel)
 {
+    if(real_function == nullptr)
+    {
+        real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+
+        // Get the default queue
+        _queue = CLScheduler::get().queue();
+
+        // Check if we can use the OpenCL timer with the default queue
+        cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+        if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+        {
+            // Set the queue for profiling
+            _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+        }
+        else
+        {
+            _queue_profiler = _queue;
+        }
+    }
+
     // Get the configuration ID from the kernel
     const std::string &config_id = kernel.config_id();
 
@@ -49,6 +75,9 @@
 
         if(p == _lws_table.end())
         {
+            // Set profiler queue
+            CLScheduler::get().set_queue(_queue_profiler);
+
             // Find the optimal LWS for the kernel
             cl::NDRange opt_lws = find_optimal_lws(kernel);
 
@@ -57,6 +86,9 @@
 
             // Set Local-Workgroup-Size
             kernel.set_lws_hint(opt_lws);
+
+            // Restore queue
+            CLScheduler::get().set_queue(_queue);
         }
         else
         {
@@ -68,41 +100,78 @@
 
 cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
 {
-    cl::CommandQueue q = CLScheduler::get().queue();
+    // Time the kernel over a sweep of local work-group sizes and return the fastest one; start by intercepting enqueues so each run's event is captured:
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
 
-    double min_exec_time = std::numeric_limits<double>::max();
+    cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 
-    cl::NDRange opt_lws = cl::NDRange(1, 1);
+    cl::NDRange opt_lws = cl::NullRange; // stays NullRange when no explicit LWS beats the default-LWS run below
 
-    for(int y = 1; y <= 16; ++y)
+    const int x_step = std::max(1, kernel.window().x().step()); // clamp so the divisions below never see a step < 1
+    const int y_step = std::max(1, kernel.window().y().step());
+    const int z_step = std::max(1, kernel.window().z().step());
+    const int x_end  = (kernel.window().x().end() - kernel.window().x().start()) / x_step > 1 ? 16 : 1; // sweep a dimension only if it has more than one iteration
+    const int y_end  = (kernel.window().y().end() - kernel.window().y().start()) / y_step > 1 ? 16 : 1; // (end - start) is parenthesised: '/' binds tighter than '-'
+    const int z_end  = (kernel.window().z().end() - kernel.window().z().start()) / z_step > 1 ? 8 : 1;
+
+    // First run using the default LWS: its time becomes the baseline every explicit LWS has to beat
     {
-        for(int x = 1; x <= 16; ++x)
+        cl::NDRange lws_test = cl::NullRange;
+
+        kernel.set_lws_hint(lws_test);
+
+        // Run the kernel
+        kernel.run(kernel.window(), _queue_profiler);
+
+        CLScheduler::get().sync();
+
+        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); // _kernel_event is expected to have been set by the Interceptor during run()
+        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+        const cl_ulong diff  = end - start;
+
+        min_exec_time = diff;
+    }
+
+    for(int z = 1; z <= z_end; ++z)
+    {
+        for(int y = 1; y <= y_end; ++y)
         {
-            cl::NDRange lws_test = cl::NDRange(x, y);
-
-            //Set the Local-Workgroup-Size
-            kernel.set_lws_hint(lws_test);
-
-            auto t_start = std::chrono::high_resolution_clock::now();
-
-            // Run
-            kernel.run(kernel.window(), q);
-
-            CLScheduler::get().sync();
-
-            auto t_stop = std::chrono::high_resolution_clock::now();
-
-            std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
-
-            // Check the execution time
-            if(fp_nano.count() < min_exec_time)
+            for(int x = 1; x <= x_end; ++x)
             {
-                min_exec_time = fp_nano.count();
-                opt_lws       = cl::NDRange(x, y);
+                cl::NDRange lws_test = cl::NDRange(x, y, z);
+
+                const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1); // skip sizes above the kernel's limit, and (1,1,1)
+
+                if(invalid_lws)
+                {
+                    continue;
+                }
+
+                //Set the Local-Workgroup-Size
+                kernel.set_lws_hint(lws_test);
+
+                // Run the kernel
+                kernel.run(kernel.window(), _queue_profiler);
+
+                CLScheduler::get().sync();
+
+                const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+                const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+                const cl_ulong diff  = end - start;
+
+                // Check the execution time: keep this LWS only if it is strictly faster than the best so far
+                if(diff < min_exec_time)
+                {
+                    min_exec_time = diff;
+                    opt_lws       = cl::NDRange(x, y, z);
+                }
             }
         }
     }
 
+    // Restore real function
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function;
+
     return opt_lws;
 }
 
@@ -115,4 +184,24 @@
 const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
 {
     return _lws_table;
-}
\ No newline at end of file
+}
+
+Interceptor::Interceptor(CLTuner &tuner)
+    : _tuner(tuner) // the tuner whose real_function is called and which receives the captured event
+{
+}
+
+cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                               const cl_event *event_wait_list, cl_event *event)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); // the caller's event slot is replaced by a local one, so callers must not ask for an event
+    ARM_COMPUTE_UNUSED(event);
+
+    cl_event tmp    = nullptr; // initialised so a failed enqueue cannot hand an indeterminate handle to the tuner
+    cl_int   retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+    // Set OpenCL event (the tuner reads its profiling timestamps after the run)
+    _tuner.set_cl_kernel_event(tmp);
+
+    return retval; // forward the real enqueue's status unchanged
+}

diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index eaf2ca5..4aeb3a1 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *

diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
index 5c2e582..0b05058 100644
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
 #include "support/ToolchainSupport.h"
 
@@ -30,11 +31,21 @@
 
 using namespace arm_compute;
 
-void CLArithmeticAddition::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1) // a border handler is only considered when the output is wider than one element along dimension 0
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; // input1 if its dimension 0 is 1, otherwise input2
+
+        if(broadcasted_info->info()->dimension(0) == 1) // holds only when the chosen input really has size 1 along dimension 0
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); // replicate that input into a border sized by the kernel's border_size()
+        }
+    }
 }
 
 Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)

diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 58215c3..f87ea6e 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,17 +37,18 @@
 {
 }
 
-void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon)
+void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
+                                          ActivationLayerInfo act_info)
 {
-    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info); // thin wrapper: every argument, including act_info, is forwarded to the kernel
 }
 
 Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
                                            const ITensorInfo *mean, const ITensorInfo *var,
                                            const ITensorInfo *beta, const ITensorInfo *gamma,
-                                           float epsilon)
+                                           float epsilon, ActivationLayerInfo act_info)
 {
-    return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon);
+    return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info); // the kernel's validate() result is returned unchanged
 }
 
 void CLBatchNormalizationLayer::run()

diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index b3af11e..1a486ce 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp

@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 
 #include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
@@ -35,319 +35,86 @@
 #include <tuple>
 
 using namespace arm_compute;
-
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
-{
-}
-
-void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
-{
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type()));
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
-    const unsigned   bias_element  = (append_biases) ? 1 : 0;
-    const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
-
-    _transpose1xW = transpose1xW;
-
-    if(transpose1xW)
-    {
-        // Create tensor to store the reshaped weights
-        const unsigned int mat_weights_cols = weights->info()->dimension(3);
-        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
-        TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
-        const DataType     dt                   = weights->info()->data_type();
-        const int          fixed_point_position = weights->info()->fixed_point_position();
-        TensorInfo         info_wr(shape_wr, 1, dt, fixed_point_position);
-
-        _weights_reshaped.allocator()->init(info_wr);
-        _memory_group.manage(&_weights_reshaped);
-        _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
-        _weights_transposed_kernel.configure(&_weights_reshaped, output);
-        _weights_reshaped.allocator()->allocate();
-    }
-    else
-    {
-        _weights_reshape_kernel.configure(weights, biases_to_use, output);
-    }
-
-    output->info()->set_quantization_info(weights->info()->quantization_info());
-}
-
-void CLConvolutionLayerReshapeWeights::run()
-{
-    _memory_group.acquire();
-
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
-    if(_transpose1xW)
-    {
-        CLScheduler::get().enqueue(_weights_transposed_kernel);
-    }
-
-    _memory_group.release();
-}
+using namespace arm_compute::misc::shape_calculator;
 
 CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _interleave_kernel(), _mm_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
-      _col2im_kernel(), _im2col_output(), _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _are_weights_reshaped(false), _is_quantized(false),
-      _is_interleaved_transposed(false)
+    : _memory_manager(std::move(memory_manager)), _function()
 {
 }
 
-void CLConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed, bool are_weights_reshaped)
+void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
-    if(_is_quantized)
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+
+    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+                                                      weights_info, CLScheduler::get().target()))
     {
-        if(are_weights_reshaped)
+        case ConvolutionMethod::DIRECT:
         {
-            ARM_COMPUTE_ERROR("Weights already reshaped are not suppported with gemmlowp");
+            auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
+            f->configure(input, weights, biases, output, conv_info);
+            _function = std::move(f);
+            break;
         }
-        else
+        case ConvolutionMethod::GEMM:
         {
-            // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
-            // Extract and negate input and weights offset
-            const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
-            const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
-            input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
-            weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
-
-            _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
-
-            // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
-            input->info()->set_quantization_info(input_quantization_info);
-            weights->info()->set_quantization_info(weights_quantization_info);
+            auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, weights_info);
+            _function = std::move(f);
+            break;
         }
-    }
-    else
-    {
-        if(are_weights_reshaped)
-        {
-            // Configure matrix multiply kernel
-            _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
-        }
-        else
-        {
-            // Configure matrix multiply function
-            _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
-        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
     }
 }
 
-void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                    const WeightsInfo &weights_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
-    ARM_COMPUTE_ERROR_ON(weights_info.are_reshaped() && CLScheduler::get().target() == GPUTarget::BIFROST);
-    ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->info()->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); // biases may be null; it is forwarded as-is
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    // Pick the same backend configure() would pick and return that backend's validation result
+    const GPUTarget gpu_target = CLScheduler::get().target();
 
-    if(biases != nullptr)
+    switch(CLConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, gpu_target))
     {
-        if(_is_quantized)
+        case ConvolutionMethod::DIRECT:
         {
-            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+            // Validate direct convolution layer (propagate a failing Status instead of discarding it)
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info));
+            break;
         }
-        else
+        case ConvolutionMethod::GEMM:
         {
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+            // Validate gemm-based convolution layer (propagate a failing Status instead of discarding it)
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info));
+            break;
         }
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
-        ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
     }
 
-    const DataType dt = input->info()->data_type();
+    return Status{}; // reached only when the selected backend reported no error
+}
 
-    // Set the GPU target for matrix multiply and im2col and col2im
-    _mm_kernel.set_target(CLScheduler::get().target());
-    _im2col_kernel.set_target(CLScheduler::get().target());
-    _col2im_kernel.set_target(CLScheduler::get().target());
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                             const WeightsInfo &weights_info, const GPUTarget gpu_target)
+{
+    ARM_COMPUTE_UNUSED(input);
+    ARM_COMPUTE_UNUSED(weights);
+    ARM_COMPUTE_UNUSED(biases);
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_UNUSED(weights_info);
+    ARM_COMPUTE_UNUSED(gpu_target);
 
-    const bool append_bias = (biases != nullptr) && (!_is_quantized);
-    _are_weights_reshaped  = weights_info.are_reshaped();
-
-    const unsigned   bias_element  = (append_bias) ? 1 : 0;
-    const ICLTensor *biases_to_use = (append_bias) ? biases : nullptr;
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-
-    // Get convolved dimensions
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-
-    const unsigned int kernel_width  = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
-    const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
-
-    // Check if its a "fully connected" convolution
-    const bool is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
-    _is_interleaved_transposed                = (!is_fully_connected_convolution) && (!_is_quantized) && (_are_weights_reshaped);
-
-    unsigned int mat_weights_cols = weights->info()->dimension(3);
-    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
-
-    // Reshape weights if needed
-    if(_are_weights_reshaped)
-    {
-        if(is_fully_connected_convolution || _is_quantized)
-        {
-            mat_weights_cols = weights->info()->dimension(0);
-            mat_weights_rows = weights->info()->dimension(1);
-        }
-        else
-        {
-            mat_weights_cols                         = weights_info.num_kernels();
-            const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
-            mat_weights_rows                         = quarter_reshaped_cols + bias_element;
-        }
-    }
-    else
-    {
-        // _weights_reshaped will be auto configured in the kernel.
-        // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
-        _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false);
-
-        weights = &_weights_reshaped;
-    }
-
-    // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->info()->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    //input->clone() doesn't work with subtensors for grouped convolutions.
-    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
-    im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
-    _im2col_output.allocator()->init(im2col_reshaped_info);
-    _memory_group.manage(&_im2col_output);
-
-    // Create GEMM output tensor
-    TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
-    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-    //input->clone() doesn't work with subtensors for grouped convolutions.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
-    info_gemm.set_quantization_info(output->info()->quantization_info());
-    _gemm_output.allocator()->init(info_gemm);
-    _memory_group.manage(&_gemm_output);
-
-    // Configure im2col
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
-
-    // Configure matrix multiply
-    if(_is_interleaved_transposed)
-    {
-        // Configure GEMMInterleave4x4. _input_interleaved_reshaped will be auto configured in the kernel
-        _interleave_kernel.configure(&_im2col_output, &_interleave_output);
-        _memory_group.manage(&_interleave_output);
-
-        // Configure GEMM
-        configure_mm(&_interleave_output, weights, &_gemm_output, true, _are_weights_reshaped);
-        _interleave_output.allocator()->allocate();
-    }
-    else
-    {
-        configure_mm(&_im2col_output, weights, &_gemm_output, false, _are_weights_reshaped);
-    }
-    _im2col_output.allocator()->allocate();
-
-    // Configure output stage for quantized case
-    if(_is_quantized)
-    {
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output->info()->quantization_info().offset);
-        _gemm_output.allocator()->allocate();
-    }
-
-    // Configure Col2Im
-    _col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, std::make_pair(conv_w, conv_h));
-    if(_is_quantized)
-    {
-        _tmp_output.allocator()->allocate();
-    }
-    else
-    {
-        _gemm_output.allocator()->allocate();
-    }
-
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-
-    // Allocate intermediate tensor
-    if(!_are_weights_reshaped)
-    {
-        _weights_reshaped.allocator()->allocate();
-    }
+    return ConvolutionMethod::GEMM;
 }
 
 void CLConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(!_are_weights_reshaped)
-    {
-        _are_weights_reshaped = true;
-        _reshape_weights.run();
-    }
-
-    _memory_group.acquire();
-
-    // Run im2col
-    CLScheduler::get().enqueue(_im2col_kernel);
-
-    // Note: _is_interleaved_transposed is true only if the weights passed to the function have been passed already reshaped
-    //       and if we do not have QASYMM8 data type. If this flag is true, we need to run the
-    //       gemm kernel instead of gemm function
-    if(_is_interleaved_transposed)
-    {
-        // Run interleave4x4 kernel
-        CLScheduler::get().enqueue(_interleave_kernel);
-
-        // Run matrix multiply kernel
-        CLScheduler::get().enqueue(_mm_kernel);
-    }
-    else
-    {
-        // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions
-        if(_is_quantized)
-        {
-            // Run gemmlowp
-            _mm_gemmlowp.run();
-
-            // Run output stage
-            _gemmlowp_output_stage.run();
-        }
-        else
-        {
-            // Run gemm
-            _mm_gemm.run();
-        }
-    }
-
-    // Reshape output matrix
-    CLScheduler::get().enqueue(_col2im_kernel, false);
-
-    _memory_group.release();
+    _function->run();
 }

diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 1c55722..e3bbe0f 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,6 +49,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
@@ -79,7 +80,7 @@
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, info, WeightsInfo()));
 
     return Status{};
 }

diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 02273fe..88e9376 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,10 +25,14 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc;
+using namespace arm_compute::misc::shape_calculator;
 
 CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
     : _kernel(), _border_handler()
@@ -37,7 +41,7 @@
 
 void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     _kernel.set_target(CLScheduler::get().target());
@@ -59,14 +63,14 @@
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
-      _v2mm_output()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
 {
 }
 
 void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
 
@@ -74,15 +78,20 @@
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    const bool      has_bias   = (biases != nullptr);
-    const GPUTarget gpu_target = CLScheduler::get().target();
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+    bool            append_bias = (biases != nullptr) && !_is_quantized;
+    const GPUTarget gpu_target  = CLScheduler::get().target();
+
+    // Calculate output shape
+    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+
+    // Output width and height
+    const unsigned int conv_w = dwc_output_shape.x();
+    const unsigned int conv_h = dwc_output_shape.y();
 
     // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+    const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
     const size_t conv_size  = conv_w * conv_h;
 
     // Im2Col configuration
@@ -90,33 +99,52 @@
     shape_im2col.set(0, patch_size);
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
-    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _input_reshaped.allocator()->init(info_im2col);
+    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
     _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
-    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
-    _weights_reshaped.allocator()->init(info_weights_reshape);
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
 
     // GEMV configuration
+    DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
     TensorShape shape_v2mm_out = input->info()->tensor_shape();
     shape_v2mm_out.set(0, conv_size * weights_z);
     shape_v2mm_out.set(1, 1);
     shape_v2mm_out.set(2, 1);
-    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _v2mm_output.allocator()->init(info_v2mm_out);
+    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.set_target(gpu_target);
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
+    // Output stage configuration
+    if(_is_quantized)
+    {
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_reshaped.allocator()->allocate();
+    }
+
+    // Fill borders on inputs
+    PixelValue zero_in(static_cast<int32_t>(0));
+    PixelValue zero_w(static_cast<int32_t>(0));
+    if(_is_quantized)
+    {
+        zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+        zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+    }
     BorderSize border_size = _v2mm_kernel.border_size();
-    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
 
     border_size.bottom = 0;
-    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
+    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
 
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
@@ -135,4 +163,9 @@
     CLScheduler::get().enqueue(_v2mm_kernel);
 
     CLScheduler::get().enqueue(_vector_to_tensor_kernel);
+
+    if(_is_quantized)
+    {
+        CLScheduler::get().enqueue(_output_stage_kernel);
+    }
 }

diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index 3b182d3..45f70d2 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,7 +68,7 @@
     }
     else
     {
-        const float diff = image_size - num_lowest_pixels;
+        const float diff = image_size - 1;
 
         for(size_t i = 0; i < 256; ++i)
         {

diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index 7a0dd09..d6cda91 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
 {
 }
 
-void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, CLKeyPointArray *const corners,
+void CLFastCorners::configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners,
                               unsigned int *num_corners, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
@@ -76,7 +76,7 @@
 
     if(!_non_max)
     {
-        _copy_array_kernel.configure(&_output, update_number, corners, &_num_buffer);
+        _copy_array_kernel.configure(&_output, update_number, _corners, &_num_buffer);
     }
     else
     {
@@ -84,7 +84,7 @@
         _memory_group.manage(&_suppr);
 
         _suppr_func.configure(&_output, &_suppr, border_mode);
-        _copy_array_kernel.configure(&_suppr, update_number, corners, &_num_buffer);
+        _copy_array_kernel.configure(&_suppr, update_number, _corners, &_num_buffer);
 
         _suppr.allocator()->allocate();
     }

diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 68c6576..2b4670b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
     }
 
     return Status{};
@@ -114,7 +114,7 @@
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
-    TensorShape shape_im2col = compute_im2col_shape(*input->info());
+    TensorShape shape_im2col = compute_im2col_shape(input->info());
     _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
 
     // Configure im2col kernel
@@ -243,7 +243,7 @@
     bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
     const GPUTarget gpu_target       = CLScheduler::get().target();
 
-    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(*input)));
+    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input)));
     const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
 

diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index c676a10..6b5cd2d 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp

@@ -38,6 +38,54 @@
 
 using namespace arm_compute;
 
+namespace
+{
+inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+    bool flag = true;
+
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+        {
+            const float scale = k < 1024 ? 2.0f : 2.5f;
+            flag              = (scale * n) > ((1.66f * n) + 38.4f);
+        }
+        else
+        {
+            flag = false;
+        }
+    }
+
+    return flag;
+}
+
+Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    if(c != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(beta);
+    return Status{};
+}
+} // namespace
+
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
       _is_first_run(true), _reshape_b_only_on_first_run(false)
@@ -46,25 +94,10 @@
 
 void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
-    if(c != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
-        ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
-        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
-    }
-
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
-    // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
-    // For Bifrost architectures we do not reshape the input matrices
-    _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -72,28 +105,50 @@
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
 
-    // Set the target for the matrix multiply kernel
-    _mm_kernel.set_target(CLScheduler::get().target());
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _interleave_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
+    // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+    // in order to know how the matrices have been reshaped
+    const int m                         = a->info()->dimension(1);
+    const int n                         = b->info()->dimension(0);
+    const int k                         = a->info()->dimension(0);
+    int       mult_transpose1xW_width   = 1;
+    int       mult_interleave4x4_height = 1;
+
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        mult_transpose1xW_width   = 4;
+        mult_interleave4x4_height = 2;
+    }
+
+    // Check if we need to reshape the matrix A and matrix B
+    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
     if(_is_interleaved_transposed)
     {
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
-        // Configure interleave kernel
-        _interleave_kernel.configure(a, &_tmp_a);
-
-        // Configure transpose kernel
-        _transpose_kernel.configure(b, &_tmp_b);
-
         // Manage intermediate buffers
         _memory_group.manage(&_tmp_a);
         _memory_group.manage(&_tmp_b);
+
+        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
+
+        // Configure interleave kernel
+        _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
+
+        // Configure transpose kernel
+        _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
     }
 
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
 
     if(_is_interleaved_transposed)
     {
@@ -110,6 +165,12 @@
     }
 }
 
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+    return Status{};
+}
+
 void CLGEMM::run()
 {
     _memory_group.acquire();

diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
new file mode 100644
index 0000000..c58af36
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp

@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped()
+{
+}
+
+void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output)
+{
+    // Perform validation step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayerReshapeWeights::validate(weights->info(),
+                                                                          (biases != nullptr) ? biases->info() : nullptr,
+                                                                          output->info()));
+
+    const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
+    const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
+
+    _weights_reshape_kernel.configure(weights, biases_to_use, output);
+
+    output->info()->set_quantization_info(weights->info()->quantization_info());
+}
+
+Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
+
+        CLWeightsReshapeKernel::validate(weights, biases, output);
+    }
+
+    return Status{};
+}
+
+void CLConvolutionLayerReshapeWeights::run()
+{
+    _memory_group.acquire();
+
+    CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+    _memory_group.release();
+}
+
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
+      _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+{
+}
+
+void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
+
+    if(_is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+        // Revert back QuantizationInfo as input and weights could be used in other convolution layers
+        input->info()->set_quantization_info(input_quantization_info);
+        weights->info()->set_quantization_info(weights_quantization_info);
+    }
+    else
+    {
+        // Configure matrix multiply function
+        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+    }
+}
+
+Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
+{
+    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */);
+    if(is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = input->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        std::unique_ptr<ITensorInfo> input_qa   = input->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        input_qa->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        // Perform validation step on GEMMLowp
+        CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
+    }
+    else
+    {
+        // Perform validation step on Matrix multiply function
+        CLGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
+    }
+    return Status{};
+}
+
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMConvolutionLayer::validate(input->info(),
+                                                                weights->info(),
+                                                                biases != nullptr ? biases->info() : nullptr,
+                                                                output->info(),
+                                                                conv_info,
+                                                                weights_info));
+
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+    const DataType dt = input->info()->data_type();
+
+    // Set the GPU target for im2col and col2im
+    _im2col_kernel.set_target(CLScheduler::get().target());
+    _col2im_kernel.set_target(CLScheduler::get().target());
+
+    const bool append_bias = (biases != nullptr) && (!_is_quantized);
+
+    const unsigned   bias_element  = (append_bias) ? 1 : 0;
+    const ICLTensor *biases_to_use = (append_bias) ? biases : nullptr;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+
+    const unsigned int kernel_width  = weights->info()->dimension(0);
+    const unsigned int kernel_height = weights->info()->dimension(1);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    unsigned int mat_weights_cols = weights->info()->dimension(3);
+    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
+
+    // _weights_reshaped will be auto configured in the kernel.
+    // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
+    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+
+    weights = &_weights_reshaped;
+
+    // Create tensor to store im2col reshaped inputs
+    const unsigned int mat_input_cols = mat_weights_rows;
+    const unsigned int mat_input_rows = conv_w * conv_h;
+    TensorShape        shape_im2col   = input->info()->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
+    im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
+    _im2col_output.allocator()->init(im2col_reshaped_info);
+    _memory_group.manage(&_im2col_output);
+
+    // Create GEMM output tensor
+    TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+    info_gemm.set_quantization_info(output->info()->quantization_info());
+    _gemm_output.allocator()->init(info_gemm);
+    _memory_group.manage(&_gemm_output);
+
+    // Configure im2col
+    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+
+    // Configure GEMM
+    configure_mm(&_im2col_output, weights, &_gemm_output);
+
+    _im2col_output.allocator()->allocate();
+
+    // Configure output stage for quantized case
+    if(_is_quantized)
+    {
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _memory_group.manage(&_tmp_output);
+        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+    }
+
+    // Configure Col2Im
+    _col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, std::make_pair(conv_w, conv_h));
+    if(_is_quantized)
+    {
+        _tmp_output.allocator()->allocate();
+    }
+    _gemm_output.allocator()->allocate();
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+    // Allocate intermediate tensor
+    _weights_reshaped.allocator()->allocate();
+
+    ARM_COMPUTE_UNUSED(weights_info);
+}
+
+/** Static validation of a CLGEMMConvolutionLayer configuration: replays the steps of configure() on tensor infos and returns the first error found. */
+Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                        const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    const bool     is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool     append_bias  = (biases != nullptr) && (!is_quantized);
+    const unsigned bias_element = (append_bias) ? 1 : 0;
+    const DataType dt           = input->data_type();
+
+    // Get convolved dimensions
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+    unsigned int       conv_w        = 0;
+    unsigned int       conv_h        = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info);
+
+    unsigned int mat_weights_cols = weights->dimension(3);
+    unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
+
+    // As in configure(), the weights reshape only receives the biases when they are appended (non-quantized case)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, append_bias ? biases : nullptr, nullptr));
+
+    // Create tensor info for im2col reshaped inputs
+    const unsigned int mat_input_cols = mat_weights_rows;
+    const unsigned int mat_input_rows = conv_w * conv_h;
+    TensorShape        shape_im2col   = input->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->fixed_point_position());
+    im2col_reshaped_info.set_quantization_info(input->quantization_info());
+    ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias));
+
+    // Create GEMM output tensor info
+    TensorShape shape_gemm = im2col_reshaped_info.tensor_shape();
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    const DataType gemm_data_type = is_quantized ? DataType::S32 : dt;
+    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->fixed_point_position());
+    info_gemm.set_quantization_info(output->quantization_info());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(&im2col_reshaped_info, weights, &info_gemm));
+
+    // Validate output stage for quantized case; its QASYMM8 result sits between GEMM and col2im, so it carries the GEMM output shape
+    TensorInfo tmp_info(shape_gemm, 1, DataType::QASYMM8, input->fixed_point_position());
+    if(is_quantized)
+    {
+        float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output->quantization_info().scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&info_gemm, biases, &tmp_info, output->quantization_info().offset));
+    }
+
+    // Validate Col2Im
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h)));
+
+    if(biases != nullptr)
+    {
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    return Status{};
+}
+
+/** Execute the convolution: first-run weights reshape, then im2col, matrix multiplication (plus output stage when quantized) and col2im. */
+void CLGEMMConvolutionLayer::run()
+{
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        _reshape_weights.run();
+        _is_first_run = false; // Cleared so that later calls skip the reshape
+    }
+
+    _memory_group.acquire(); // The intermediates registered with the memory group in configure() are used from here on
+
+    // Run im2col
+    CLScheduler::get().enqueue(_im2col_kernel);
+
+    // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions
+    if(_is_quantized)
+    {
+        // Run gemmlowp
+        _mm_gemmlowp.run();
+
+        // Run output stage
+        _gemmlowp_output_stage.run();
+    }
+    else
+    {
+        // Run gemm
+        _mm_gemm.run();
+    }
+
+    // Reshape output matrix
+    CLScheduler::get().enqueue(_col2im_kernel, false); // NOTE(review): unlike the im2col enqueue, passes false as second argument — presumably the flush flag; confirm against CLScheduler::enqueue
+
+    _memory_group.release(); // Paired with the acquire() above
+}

diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index ddcab6a..c688299 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,6 +35,29 @@
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
+namespace
+{
+/** Decide whether matrices A and B go through the reshaped (interleaved / transposed) path.
+ *
+ * Targets other than GPUTarget::BIFROST always get true. On Bifrost the answer is false unless
+ * k > 256, m > 4 and reshape_b_only_on_first_run all hold, in which case it depends on n only.
+ *
+ * @return True if the matrices should be reshaped before the matrix multiplication, false otherwise.
+ */
+inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+    if(gpu_target != GPUTarget::BIFROST)
+    {
+        return true;
+    }
+
+    // COMPMID-852
+    const bool heuristic_applies = (k > 256) && (m > 4) && reshape_b_only_on_first_run;
+
+    return heuristic_applies && ((0.72f + n * 0.10766f) < (n * 0.1284f));
+}
+} // namespace
+
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
       _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
@@ -51,33 +74,45 @@
     _a_offset                    = a->info()->quantization_info().offset;
     _b_offset                    = b->info()->quantization_info().offset;
 
-    // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
-    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _mtx_a_reshape_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
 
+    // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+    // in order to know how the matrices have been reshaped
+    const int     m                         = a->info()->dimension(1);
+    const int     n                         = b->info()->dimension(0);
+    const int     k                         = a->info()->dimension(0);
+    constexpr int mult_transpose1xW_width   = 1;
+    constexpr int mult_interleave4x4_height = 1;
+
+    // Check if we need to reshape the matrix A and matrix B
+    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, _reshape_b_only_on_first_run, gpu_target);
+
     if(_is_interleaved_transposed)
     {
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
-        TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
-        TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
-        _tmp_a.allocator()->init(info_a);
-        _tmp_b.allocator()->init(info_b);
         _memory_group.manage(&_tmp_a);
         _memory_group.manage(&_tmp_b);
 
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a);
+        _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
 
         // Configure transpose kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+        _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
     }
 
     // Configure matrix multiply kernel
-    _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed);
+    _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
     if(_a_offset != 0)
@@ -136,22 +171,30 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
-    int32_t a_offset                  = a->quantization_info().offset;
-    int32_t b_offset                  = b->quantization_info().offset;
-    bool    is_interleaved_transposed = a->dimension(1) > 16;
+    int32_t a_offset = a->quantization_info().offset;
+    int32_t b_offset = b->quantization_info().offset;
 
-    if(is_interleaved_transposed)
+    const int             m                         = a->dimension(1);
+    const int             n                         = b->dimension(0);
+    const int             k                         = a->dimension(0);
+    constexpr int         mult_transpose1xW_width   = 1;
+    constexpr int         mult_interleave4x4_height = 1;
+    const GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
+
+    bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+
+    if(reshape_matrices)
     {
-        TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
-        TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());
+        TensorInfo info_a(compute_interleaved_shape(*a, mult_interleave4x4_height), 1, a->data_type());
+        TensorInfo info_b(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width), 1, b->data_type());
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, mult_interleave4x4_height));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, mult_transpose1xW_width));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output, reshape_matrices, reshape_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output, reshape_matrices, reshape_info));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;

diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index 678848b..911c9b3 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
 {
 }
 
-void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, const ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
+void CLLaplacianReconstruct::configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
     ARM_COMPUTE_ERROR_ON(input == output);

diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index a89a45a..9120aad 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
 
     if(biases != nullptr)
     {

diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index f23e231..146856c 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp

@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
+#include "arm_compute/core/Error.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -34,4 +35,10 @@
     auto k = arm_compute::support::cpp14::make_unique<CLPermuteKernel>();
     k->configure(input, output, perm);
     _kernel = std::move(k);
+}
+
+Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm)); // Forward the kernel's Status; ARM_COMPUTE_RETURN_ERROR_ON expects a boolean error condition, not a Status
+    return Status{};
 }
\ No newline at end of file

diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index c78f944..b4c20db 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 #include "support/ToolchainSupport.h"
 
@@ -30,16 +31,26 @@
 
 using namespace arm_compute;
 
-void CLPixelWiseMultiplication::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1) // A border handler is only considered when the output is wider than one element along dimension 0
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; // Candidate tensor: input1 if its dimension 0 is 1, otherwise input2
+        // Only a candidate whose dimension 0 really is 1 gets a REPLICATE border (presumably so reads past its single element stay valid — confirm)
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
 }
 
 Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
                                            ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
-}
\ No newline at end of file
+}

diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7c96111..a92fbce 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
 using namespace arm_compute;
 
 CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _run_legacy_path(false)
+    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
 {
 }
 
@@ -65,16 +65,7 @@
     _memory_group.manage(&_sum);
 
     // Configure kernels
-    _run_legacy_path = is_data_type_quantized_asymmetric(input->info()->data_type());
-    if(_run_legacy_path)
-    {
-        _max_kernel.configure(input, &_max);
-        _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
-    }
-    else
-    {
-        _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
-    }
+    _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
     _norm_kernel.configure(&_tmp, &_sum, output, beta);
 
     // Allocate intermediate buffers
@@ -96,16 +87,7 @@
     TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape));
     TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()));
 
-    bool run_legacy_path = is_data_type_quantized_asymmetric(input->data_type());
-    if(run_legacy_path)
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxKernel::validate(input, &tensor_info_max));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
 
     return Status{};
@@ -115,16 +97,7 @@
 {
     _memory_group.acquire();
 
-    // Force to use the new fused kernel
-    if(_run_legacy_path)
-    {
-        CLScheduler::get().enqueue(_max_kernel, false);
-        CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
-    }
-    else
-    {
-        CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
-    }
+    CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
     CLScheduler::get().enqueue(_norm_kernel);
 
     _memory_group.release();

diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index a83a0bc..4e4dd87 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -156,7 +156,7 @@
 }
 
 CPPScheduler::CPPScheduler()
-    : _num_threads(std::thread::hardware_concurrency()),
+    : _num_threads(num_threads_hint()),
       _threads(_num_threads - 1)
 {
 }

diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
index 99bdf43..cc5e8f4 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,9 +37,10 @@
 {
 }
 
-void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon)
+void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon,
+                                          ActivationLayerInfo act_info)
 {
-    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
 void GCBatchNormalizationLayer::run()

diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index 5689722..1d2370e 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp

@@ -193,7 +193,6 @@
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
 
-    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
     TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
     _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
 
@@ -204,7 +203,6 @@
         shape_interleaved.set(0, shape_interleaved.x() * 4);
         shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
 
-        // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
         TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
         _input_interleaved_reshaped.allocator()->init(interleaved_info);
     }
@@ -215,7 +213,6 @@
     shape_gemm.set(1, mat_input_rows);
     const DataType gemm_data_type = dt;
 
-    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
     TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
     _gemm_output.allocator()->init(info_gemm);
 

diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index ef65989..9cba371 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,11 @@
 
 using namespace arm_compute;
 
+GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
+    : _kernel(nullptr), _border_handler(), _shift_handler() // No kernel yet (configure() builds one before using it); both handlers are default-constructed
+{
+}
+
 void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
@@ -38,4 +43,15 @@
 
     // Configure border handler
     _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+    _shift_handler.configure(input);
+}
+
+void GCDepthwiseConvolutionLayer3x3::run()
+{
+    GCScheduler::get().dispatch(_shift_handler, false); // 1) Shift handler, configured on the input tensor in configure()
+    GCScheduler::get().memory_barrier(); // Barrier between shift and border fill — presumably orders their accesses to the input; confirm
+    GCScheduler::get().dispatch(_border_handler, false); // 2) Border handler (constant border, see configure())
+    GCScheduler::get().memory_barrier(); // Barrier again before the main kernel is dispatched
+    GCScheduler::get().dispatch(*_kernel); // 3) The kernel held in _kernel
 }

diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index ae9dd51..a2607d4 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,12 +27,19 @@
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+GCDirectConvolutionLayer::GCDirectConvolutionLayer()
+    : _kernel(nullptr), _border_handler(), _shift_handler() // No kernel yet (presumably assigned in configure(), which dereferences it); both handlers are default-constructed
+{
+}
+
+void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     int kernel_size = weights->info()->dimension(0);
 
@@ -61,4 +68,15 @@
     }
 
     _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+    _shift_handler.configure(input);
+}
+
+void GCDirectConvolutionLayer::run()
+{
+    GCScheduler::get().dispatch(_shift_handler, false); // 1) Shift handler, configured on the input tensor in configure()
+    GCScheduler::get().memory_barrier(); // Barrier between shift and border fill — presumably orders their accesses to the input; confirm
+    GCScheduler::get().dispatch(_border_handler, false); // 2) Border handler (constant border, see configure())
+    GCScheduler::get().memory_barrier(); // Barrier again before the main kernel is dispatched
+    GCScheduler::get().dispatch(*_kernel); // 3) The kernel held in _kernel
 }

diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 7aa2d42..5122c20 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,10 +45,14 @@
 {
 }
 
-void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta)
+void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.reshape_b_only_on_first_run(), "Reshape matrix B only on first run is not supported");
+    ARM_COMPUTE_UNUSED(gemm_info);
 
     if(c != nullptr)
     {

diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
index ff03eff..dcbb39d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,10 +25,17 @@
 
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
 #include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
+GCPoolingLayer::GCPoolingLayer()
+    : _kernel(nullptr), _border_handler(), _shift_handler() // No kernel yet (presumably assigned in configure(), which dereferences it); both handlers are default-constructed
+{
+}
+
 void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
 {
     // Configure pooling kernel
@@ -39,9 +46,20 @@
     // Configure border depending on operation required
     BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
     _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
+
+    _shift_handler.configure(input);
 }
 
 Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     return GCPoolingLayerKernel::validate(input, output, pool_info);
-}
\ No newline at end of file
+}
+
+void GCPoolingLayer::run()
+{
+    GCScheduler::get().dispatch(_shift_handler, false); // 1) Shift handler, configured on the input tensor in configure()
+    GCScheduler::get().memory_barrier(); // Barrier between shift and border fill — presumably orders their accesses to the input; confirm
+    GCScheduler::get().dispatch(_border_handler, false); // 2) Border handler (mode chosen from the pooling type in configure())
+    GCScheduler::get().memory_barrier(); // Barrier again before the pooling kernel is dispatched
+    GCScheduler::get().dispatch(*_kernel); // 3) The kernel held in _kernel
+}

diff --git a/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp
new file mode 100644
index 0000000..93496f4
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp

@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h"
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void GCTensorShift::configure(IGCTensor *input)
+{
+    auto k = arm_compute::support::cpp14::make_unique<GCTensorShiftKernel>();
+    k->configure(input);
+    _kernel = std::move(k);
+}

diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 0254181..583cb40 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,18 +27,73 @@
 #include <cstdlib>
 #include <cstring>
 #include <fcntl.h>
+#include <fstream>
+#include <map>
 #include <sched.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
+#ifndef BARE_METAL
+#include <regex>
+#include <thread>
+#endif /* BARE_METAL */
+
 namespace
 {
+unsigned int get_threads_hint()
+{
+    unsigned int num_threads_hint = 1;
+
+#ifndef BARE_METAL
+    std::map<std::string, unsigned int> cpu_part_occurrence_map;
+
+    // CPU part regex
+    std::regex  cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
+    std::smatch cpu_part_match;
+
+    // Read cpuinfo and get occurrence of each core
+    std::ifstream cpuinfo;
+    cpuinfo.open("/proc/cpuinfo", std::ios::in);
+    if(cpuinfo.is_open())
+    {
+        std::string line;
+        while(bool(getline(cpuinfo, line)))
+        {
+            if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+            {
+                std::string cpu_part = cpu_part_match[1];
+                if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
+                {
+                    cpu_part_occurrence_map[cpu_part]++;
+                }
+                else
+                {
+                    cpu_part_occurrence_map[cpu_part] = 1;
+                }
+            }
+        }
+    }
+
+    // Get min number of threads
+    auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
+                                             [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
+    {
+        return p1.second < p2.second;
+    });
+
+    // Set thread hint
+    num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
+#endif /* BARE_METAL */
+
+    return num_threads_hint;
+}
+
 unsigned int get_cpu_impl()
 {
 #ifndef BARE_METAL
     int fd = open("/proc/cpuinfo", 0); // NOLINT
-    std::array<char, 1200> buff{ {} };
+    std::array<char, 3000> buff{ {} };
     char *pos     = nullptr;
     char *end     = nullptr;
     bool  foundid = false;
@@ -50,7 +105,7 @@
         return 0;
     }
 
-    int charsread = read(fd, buff.data(), 1200);
+    int charsread = read(fd, buff.data(), 3000);
     pos           = buff.data();
     end           = buff.data() + charsread;
 
@@ -129,6 +184,10 @@
 {
 IScheduler::IScheduler()
 {
+    // Work out the best possible number of execution threads
+    _num_threads_hint = get_threads_hint();
+
+    // Work out the CPU implementation
     switch(get_cpu_impl())
     {
         case 0xd0f:
@@ -161,4 +220,9 @@
 {
     return _info;
 }
+
+unsigned int IScheduler::num_threads_hint() const
+{
+    return _num_threads_hint;
+}
 } // namespace arm_compute

diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp
index def1487..6eba71b 100644
--- a/src/runtime/MultiImage.cpp
+++ b/src/runtime/MultiImage.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 
 using namespace arm_compute;
@@ -51,7 +52,8 @@
 
 void MultiImage::internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding)
 {
-    TensorInfo info(width, height, Format::U8);
+    TensorShape shape = adjust_odd_shape(TensorShape{ width, height }, format);
+    TensorInfo  info(shape, Format::U8);
 
     if(auto_padding)
     {
@@ -72,7 +74,7 @@
         case Format::YUYV422:
         case Format::UYVY422:
         {
-            TensorInfo info_full(width, height, format);
+            TensorInfo info_full(shape, format);
 
             if(auto_padding)
             {
@@ -85,7 +87,8 @@
         case Format::NV12:
         case Format::NV21:
         {
-            TensorInfo info_uv88(width / 2, height / 2, Format::UV88);
+            const TensorShape shape_uv88 = calculate_subsampled_shape(shape, Format::UV88);
+            TensorInfo        info_uv88(shape_uv88, Format::UV88);
 
             if(auto_padding)
             {
@@ -98,7 +101,8 @@
         }
         case Format::IYUV:
         {
-            TensorInfo info_sub2(width / 2, height / 2, Format::U8);
+            const TensorShape shape_sub2 = calculate_subsampled_shape(shape, Format::IYUV);
+            TensorInfo        info_sub2(shape_sub2, Format::U8);
 
             if(auto_padding)
             {
@@ -120,7 +124,7 @@
             break;
     }
 
-    _info.init(width, height, format);
+    _info.init(shape.x(), shape.y(), format);
 }
 
 void MultiImage::allocate()

diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index cdf1b54..6af71a3 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *

diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index b5dd4d0..7d8e3cf 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "support/ToolchainSupport.h"
 
@@ -30,11 +31,21 @@
 
 using namespace arm_compute;
 
-void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
 }
 Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {

diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index f6be001..bb224db 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,16 +37,18 @@
 {
 }
 
-void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
+void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon,
+                                          ActivationLayerInfo act_info)
 {
     // Configure kernel
-    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
 Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
-                                           float epsilon)
+                                           float epsilon, ActivationLayerInfo act_info)
 {
-    return NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
+    return Status{};
 }
 
 void NEBatchNormalizationLayer::run()

diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 8f7d940..0a49158 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,518 +23,96 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
 
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
 #include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
 #include <cmath>
 #include <tuple>
 
 namespace arm_compute
 {
-namespace
-{
-TensorShape get_reshaped_weights_shape(const ITensorInfo *weights, bool has_bias)
-{
-    const unsigned int mat_weights_cols = weights->dimension(3);
-    const unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
-    return TensorShape(mat_weights_cols, mat_weights_rows);
-}
-} // namespace
-
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
-{
-}
-
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
-{
-    // Perform validation step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
-                                                                          (biases != nullptr) ? biases->info() : nullptr,
-                                                                          output->info(),
-                                                                          transpose1xW));
-
-    // Check if bias are present, if yes they will be embedded to the weights matrix
-    const bool _has_bias = (biases != nullptr);
-
-    _transpose1xW = transpose1xW;
-
-    if(transpose1xW)
-    {
-        // Create tensor to store the reshaped weights
-        TensorInfo info_wr = weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(get_reshaped_weights_shape(weights->info(), _has_bias));
-
-        _weights_reshaped.allocator()->init(info_wr);
-        _memory_group.manage(&_weights_reshaped);
-
-        _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
-        _weights_transposed_kernel.configure(&_weights_reshaped, output);
-
-        _weights_reshaped.allocator()->allocate();
-    }
-    else
-    {
-        _weights_reshape_kernel.configure(weights, biases, output);
-    }
-}
-
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    // Check if bias are present, if yes they will be embedded to the weights matrix
-    const bool has_bias = (biases != nullptr);
-
-    // Checks performed when biases are present
-    if(has_bias)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    if(transpose1xW)
-    {
-        TensorInfo weights_reshaped = weights->clone()->set_tensor_shape(get_reshaped_weights_shape(weights, has_bias));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(&weights_reshaped, output));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
-    }
-
-    return Status{};
-}
-
-void NEConvolutionLayerReshapeWeights::run()
-{
-    _memory_group.acquire();
-
-    NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
-    if(_transpose1xW)
-    {
-        NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
-    }
-
-    _memory_group.release();
-}
-
-namespace
-{
-TensorShape get_reshaped_weights_shape_conv(const ITensorInfo *weights, bool has_bias, bool is_fully_connected_convolution)
-{
-    unsigned int mat_weights_cols = weights->dimension(3);
-    unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
-
-    if(is_fully_connected_convolution)
-    {
-        // Create tensor to store the reshaped weights
-        return TensorShape(mat_weights_cols, mat_weights_rows);
-    }
-    else
-    {
-        // Create tensor to store transposed weights
-        const float transpose_width = 16.0f / weights->element_size();
-        return TensorShape(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
-    }
-}
-
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
-                                      bool &has_bias,
-                                      bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height, bool &is_fully_connected_convolution, unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
-                                      unsigned int &conv_w, unsigned int &conv_h)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && biases->dimension(0) != weights->dimension(3));
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    dt                   = input->data_type();
-    has_bias             = (biases != nullptr);
-    are_weights_reshaped = weights_info.are_reshaped();
-    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
-    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
-    mat_weights_cols     = weights->dimension(3);
-    mat_weights_rows     = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
-
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
-
-    is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
-
-    return Status{};
-}
-} // namespace
-
 NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
-      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+    : _memory_manager(std::move(memory_manager)), _function()
 {
 }
 
-void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
 
-    DataType     dt{};
-    unsigned int kernel_width     = 0;
-    unsigned int kernel_height    = 0;
-    unsigned int mat_weights_cols = 0;
-    unsigned int mat_weights_rows = 0;
-    unsigned int conv_w           = 0;
-    unsigned int conv_h           = 0;
-
-    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _has_bias, _are_weights_reshaped,
-                                                   kernel_width, kernel_height,
-                                                   _is_fully_connected_convolution,
-                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h);
-
-    ARM_COMPUTE_ERROR_THROW_ON(status);
-
-    const unsigned int fixed_point_position = input->info()->fixed_point_position();
-
-#if defined(__arm__)
-    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+                                                      weights_info))
     {
-        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
-    }
-#elif defined(__aarch64__)
-    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
-    {
-        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
-    }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-    // Reshape weights if needed
-    if(_mm_optimised_kernel != nullptr)
-    {
-        if(_are_weights_reshaped)
+        case ConvolutionMethod::WINOGRAD:
         {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->info()->dimension(1);
+            auto f = arm_compute::support::cpp14::make_unique<NEWinogradLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info);
+            _function = std::move(f);
+            break;
         }
-        else
+        case ConvolutionMethod::GEMM:
         {
-            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
-
-            // Create tensor to store the reshaped weights
-            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
-            weights = &_weights_reshaped;
+            auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, weights_info);
+            _function = std::move(f);
+            break;
         }
-    }
-    else
-    {
-        if(_are_weights_reshaped)
+        case ConvolutionMethod::DIRECT:
         {
-            if(_is_fully_connected_convolution)
-            {
-                mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights->info()->dimension(1);
-            }
-            else
-            {
-                const unsigned int transpose_width = 16 / input->info()->element_size();
-                mat_weights_cols                   = weights_info.num_kernels();
-                mat_weights_rows                   = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
-            }
+            auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info);
+            _function = std::move(f);
+            break;
         }
-        else
-        {
-            TensorShape reshaped_weights_shape;
-
-            if(_is_fully_connected_convolution)
-            {
-                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
-            }
-            else
-            {
-                // Create tensor to store transposed weights
-                const float transpose_width = 16.0f / input->info()->element_size();
-                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
-                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
-            }
-
-            // Create tensor to store the reshaped weights
-            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, !_is_fully_connected_convolution /* 1xW transpose */);
-            weights = &_weights_reshaped;
-        }
-    }
-
-    // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-
-    TensorShape shape_im2col(input->info()->tensor_shape());
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _memory_group.manage(&_input_im2col_reshaped);
-
-    // Create tensor (interleave) to prepare input tensor for GEMM
-    if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
-    {
-        TensorShape shape_interleaved(shape_im2col);
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
-        _memory_group.manage(&_input_interleaved_reshaped);
-    }
-
-    // Create GEMM output tensor
-    TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    _gemm_output.allocator()->init(_input_im2col_reshaped.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_gemm));
-    _memory_group.manage(&_gemm_output);
-
-    // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
-
-#if defined(__arm__) || defined(__aarch64__)
-    if(_mm_optimised_kernel != nullptr)
-    {
-        struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-        const int M = _gemm_output.info()->tensor_shape().y();
-        const int N = _gemm_output.info()->tensor_shape().x();
-        const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-
-#if defined(__arm__)
-        GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
-        GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-        constexpr size_t alignment = 4096;
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
-
-        _workspace.allocator()->allocate();
-    }
-    else
-#endif /* defined(__arm__) || defined(__aarch64__) */
-    {
-        if(_is_fully_connected_convolution)
-        {
-            _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f);
-        }
-        else
-        {
-            _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
-            _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
-            _input_interleaved_reshaped.allocator()->allocate();
-        }
-    }
-
-    _input_im2col_reshaped.allocator()->allocate();
-    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
-    _gemm_output.allocator()->allocate();
-
-    // Allocate intermediate tensor
-    if(!_are_weights_reshaped)
-    {
-        _weights_reshaped.allocator()->allocate();
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
     }
 }
 
 Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                     const WeightsInfo &weights_info)
 {
-    DataType     dt{};
-    bool         has_bias{};
-    bool         are_weights_reshaped{};
-    bool         is_fully_connected_convolution{};
-    unsigned int kernel_width     = 0;
-    unsigned int kernel_height    = 0;
-    unsigned int mat_weights_cols = 0;
-    unsigned int mat_weights_rows = 0;
-    unsigned int conv_w           = 0;
-    unsigned int conv_h           = 0;
-
-    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, has_bias, are_weights_reshaped, kernel_width, kernel_height,
-                                                   is_fully_connected_convolution, mat_weights_cols, mat_weights_rows,
-                                                   conv_w, conv_h);
-
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-
-    std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
-    bool                         optimised_kernel = false;
-
-#if defined(__arm__)
-    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info))
     {
-        optimised_kernel = true;
+        case ConvolutionMethod::WINOGRAD:
+            //Validate Winograd
+            NEWinogradLayer::validate(input, weights, biases, output, conv_info);
+            break;
+        case ConvolutionMethod::GEMM:
+            //Validate Gemm-based Convolution
+            NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+            break;
+        case ConvolutionMethod::DIRECT:
+            //Validate Direct Convolution
+            NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
     }
-#elif defined(__aarch64__)
-    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
-    {
-        optimised_kernel = true;
-    }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-    // Reshape weights if needed
-    if(optimised_kernel)
-    {
-        if(are_weights_reshaped)
-        {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->dimension(1);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
-
-            // Create tensor to store the reshaped weights
-            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, has_bias, is_fully_connected_convolution));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-            weights = reshaped_weights.get();
-        }
-    }
-    else
-    {
-        if(are_weights_reshaped)
-        {
-            const unsigned int transpose_width = 16 / input->element_size();
-            mat_weights_cols                   = weights_info.num_kernels();
-            mat_weights_rows                   = weights->dimension(0) / transpose_width + (has_bias ? 1 : 0);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape;
-
-            if(is_fully_connected_convolution)
-            {
-                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
-            }
-            else
-            {
-                // Create tensor to store transposed weights
-                const float transpose_width = 16.0f / input->element_size();
-                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
-                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
-            }
-
-            // Create tensor to store the reshaped weights
-            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, has_bias, is_fully_connected_convolution));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-            weights = reshaped_weights.get();
-        }
-    }
-
-    // Validate im2col
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-    TensorShape        shape_im2col   = input->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, Size2D(weights->dimension(0), weights->dimension(1)), conv_info, has_bias));
-
-    // Create GEMM output tensor
-    TensorShape shape_gemm(im2_col_info.tensor_shape());
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
-
-    // Validate GEMM interleave and multiply
-    if(!is_fully_connected_convolution)
-    {
-        TensorShape shape_interleaved = shape_im2col;
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info));
-    }
-
-    ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
 
     return Status{};
 }
 
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                             const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_UNUSED(weights_info);
+    if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
+       && (conv_info.stride().second == 1) && (biases != nullptr))
+    {
+        return ConvolutionMethod::WINOGRAD;
+    }
+    return ConvolutionMethod::GEMM;
+}
+
 void NEConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(!_are_weights_reshaped)
-    {
-        _are_weights_reshaped = true;
-        _reshape_weights.run();
-    }
-
-    _memory_group.acquire();
-
-    // Run input reshaping
-    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
-
-    // Runs matrix multiply on reshaped matrices
-    if(_mm_optimised_kernel != nullptr)
-    {
-        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
-    }
-    else
-    {
-        if(!_is_fully_connected_convolution)
-        {
-            // Run interleave
-            NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
-        }
-
-        NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
-    }
-
-    // Reshape output matrix
-    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
-
-    _memory_group.release();
+    _function->run();
 }
 } // namespace arm_compute

diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 61ffc77..c1ba5dd 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5);
+    ARM_COMPUTE_ERROR_ON(!info.padding_is_symmetric());
 
     _input        = input;
     _info         = info;
@@ -76,6 +76,8 @@
     // setup the function to convolve the upscaled output
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
     _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
+
+    // Allocate auxiliary tensors
     _scaled_output.allocator()->allocate();
 }
 
@@ -113,6 +115,8 @@
         }
     }
 
+    // Run convolution layer
     _conv_f.run();
+
     _memory_group.release();
 }

diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 2d08b45..95fcf88 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp

@@ -26,14 +26,18 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc;
+using namespace arm_compute::misc::shape_calculator;
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _kernel(), _output_stage_kernel(), _border_handler(), _accumulator(), _has_bias(false), _is_quantized(false)
+    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
+      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
 {
 }
 
@@ -46,30 +50,61 @@
 
     _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
     _has_bias     = biases != nullptr;
+    _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
+                                                                                          conv_info,
+                                                                                          input->info()->data_type());
+    _are_weights_reshaped = false;
 
-    // Allocate the intermediate accumulator tensor in case of fixed point input
-    if(_is_quantized)
+    if(_is_optimized)
     {
-        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
-        _accumulator.info()->set_quantization_info(input->info()->quantization_info());
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        // Configure the function to transform the input tensor from NCHW -> NHWC
+        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+
+        // Configure the function to transform the weights tensor from IHW -> HWI
+        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+
+        // Configure optimized depthwise
+        _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+
+        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+        _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+
+        // Allocate tensors
+        _input_nhwc.allocator()->allocate();
+        _weights_hwio.allocator()->allocate();
+        _output_nhwc.allocator()->allocate();
+
+        // Create convolver (deferred)
+        _dwc_kernel.generate_convolver();
     }
+    else
+    {
+        // Allocate the intermediate accumulator tensor in case of fixed point input
+        if(_is_quantized)
+        {
+            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+            _accumulator.info()->set_quantization_info(input->info()->quantization_info());
+            zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        }
 
-    // Configure depthwise convolution kernel
-    _kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+        // Configure depthwise convolution kernel
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
 
-    // Configure border handler
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+        // Configure border handler
+        _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    }
 
     // Configure biases accumulation
     if(_has_bias || _is_quantized)
     {
         if(_is_quantized)
         {
-            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
             int   output_multiplier, output_shift;
             quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-            _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+            _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output_quant_info.offset);
             _accumulator.allocator()->allocate();
         }
         else
@@ -81,8 +116,35 @@
 
 void NEDepthwiseConvolutionLayer3x3::run()
 {
-    NEScheduler::get().schedule(&_border_handler, Window::DimX);
-    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    // Permute weights in HWIO format if the optimized kernel will be executed
+    if(!_are_weights_reshaped && _is_optimized)
+    {
+        _are_weights_reshaped = true;
+        _permute_weights.run();
+    }
+
+    // Handle input
+    if(_is_optimized)
+    {
+        // Permute input to NHWC format execution
+        _permute_input.run();
+    }
+    else
+    {
+        // Fill border in NCHW format execution
+        NEScheduler::get().schedule(&_border_handler, Window::DimX);
+    }
+
+    // Execute depthwise convolution
+    NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
+
+    // Permute output to ACL's native NCHW format in case of NHWC execution
+    if(_is_optimized)
+    {
+        _permute_output.run();
+    }
+
+    // Add biases
     if(_has_bias || _is_quantized)
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
@@ -90,13 +152,14 @@
 }
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
 {
 }
 
 void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
 
@@ -104,14 +167,20 @@
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    bool has_bias = (biases != nullptr);
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    unsigned int conv_w = 0;
-    unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
+    // Should bias be appended ?
+    bool append_bias = (biases != nullptr) && !_is_quantized;
+
+    // Calculate output shape
+    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+
+    // Output width and height
+    const unsigned int conv_w = dwc_output_shape.x();
+    const unsigned int conv_h = dwc_output_shape.y();
 
     // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
+    const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
     const size_t conv_size  = conv_w * conv_h;
 
     // Im2Col configuration
@@ -119,25 +188,50 @@
     shape_im2col.set(0, patch_size);
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
-    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _input_reshaped.allocator()->init(info_im2col);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
+    _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
-    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
-    _weights_reshaped.allocator()->init(info_weights_reshape);
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+    _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
 
     // GEMV configuration
+    DataType    v2mm_dt        = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
     TensorShape shape_v2mm_out = input->info()->tensor_shape();
     shape_v2mm_out.set(0, conv_size * weights_z);
     shape_v2mm_out.set(1, 1);
     shape_v2mm_out.set(2, 1);
-    const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    _v2mm_output.allocator()->init(info_v2mm_out);
+    _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+
+    // Output stage configuration
+    if(_is_quantized)
+    {
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_reshaped.allocator()->allocate();
+    }
+
+    // Fill borders on inputs
+    PixelValue zero_in(static_cast<int32_t>(0));
+    PixelValue zero_w(static_cast<int32_t>(0));
+    if(_is_quantized)
+    {
+        zero_in = PixelValue(static_cast<int32_t>(input->info()->quantization_info().offset));
+        zero_w  = PixelValue(static_cast<int32_t>(weights->info()->quantization_info().offset));
+    }
+    BorderSize border_size = _v2mm_kernel.border_size();
+    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, zero_in);
+
+    border_size.bottom = 0;
+    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, zero_w);
 
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
@@ -149,6 +243,12 @@
 {
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
     NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
+    NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
     NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
-}
\ No newline at end of file
+    if(_is_quantized)
+    {
+        NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
+    }
+}

diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 408eff5..32edf93 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,6 @@
 void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, false, true);
     _kernel = std::move(k);
 }
\ No newline at end of file

diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index fc04e28..26b7271 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,15 +23,18 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include <algorithm>
 #include <cmath>
 
-namespace arm_compute
-{
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
 NEFullyConnectedLayerReshapeWeights::NEFullyConnectedLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _transpose_kernel(), _transpose1xW_kernel(), _transpose_output(), _transpose_weights(false), _is_batched_fc_layer(false)
 {
@@ -39,13 +42,10 @@
 
 void NEFullyConnectedLayerReshapeWeights::configure(const ITensor *input, ITensor *output, bool transpose_weights, bool is_batched_fc_layer)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 2);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-    ARM_COMPUTE_ERROR_ON(!transpose_weights && !is_batched_fc_layer);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    const DataType data_type            = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerReshapeWeights::validate(input->info(), output->info(), transpose_weights, is_batched_fc_layer));
 
     _transpose_weights   = transpose_weights;
     _is_batched_fc_layer = is_batched_fc_layer;
@@ -56,8 +56,7 @@
         if(_is_batched_fc_layer)
         {
             // Initialize the output tensor for transpose
-            TensorShape shape_transposed(input->info()->dimension(1), input->info()->dimension(0));
-            _transpose_output.allocator()->init(TensorInfo(shape_transposed, 1, data_type, fixed_point_position));
+            _transpose_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*input->info())));
             _memory_group.manage(&_transpose_output);
             _transpose_kernel.configure(input, &_transpose_output);
 
@@ -79,11 +78,39 @@
             // Configure transpose 1xW kernel
             _transpose1xW_kernel.configure(input, output);
         }
+    }
+}
+
+Status NEFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output, bool transpose_weights, bool is_batched_fc_layer)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!transpose_weights && !is_batched_fc_layer, "Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+
+    if(transpose_weights)
+    {
+        if(is_batched_fc_layer)
+        {
+            std::unique_ptr<ITensorInfo> use_output = output->clone();
+            use_output->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*input));
+
+            ARM_COMPUTE_RETURN_ON_ERROR(NETransposeKernel::validate(input, use_output.get()));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(use_output.get(), output));
+        }
         else
         {
-            ARM_COMPUTE_ERROR("Configuration transpose_weights=false & is_batched_fc_layer=false not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(NETransposeKernel::validate(input, output));
         }
     }
+    else
+    {
+        if(is_batched_fc_layer)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(input, output));
+        }
+    }
+
+    return Status{};
 }
 
 void NEFullyConnectedLayerReshapeWeights::run()
@@ -122,26 +149,25 @@
     // Weights: flat(In) x Out
     // Biases: Out
     // Output: Out x B (B can be multi-dimensional)
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(),
+                                                               weights->info(),
+                                                               biases != nullptr ? biases->info() : nullptr,
+                                                               output->info(),
+                                                               transpose_weights,
+                                                               are_weights_reshaped));
 
-    const DataType data_type            = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-    const int      num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
-    const int      num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
-    const size_t   linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
+    const int    num_batch_dimensions = std::max(0, static_cast<int>(output->info()->tensor_shape().num_dimensions()) - 1);
+    const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
+    const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
     _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
     _is_batched_fc_layer  = num_batch_dimensions > 0;
 
-    // Check if number of batches match
-    ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size_upper(num_input_dimensions) != output->info()->tensor_shape().total_size_upper(1));
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
-
     const size_t   interleave_width = 16 / input->info()->element_size();
     const ITensor *weights_to_use   = weights;
 
@@ -149,65 +175,33 @@
     {
         weights_to_use = &_reshape_weights_output;
 
-        TensorShape reshaped_weights_shape(weights->info()->tensor_shape());
-
-        // Transpose weights if the user hasn't done it
-        if(transpose_weights)
-        {
-            const size_t shape_x = reshaped_weights_shape.x();
-            reshaped_weights_shape.set(0, reshaped_weights_shape.y());
-            reshaped_weights_shape.set(1, shape_x);
-        }
-
-        // If the we run multiple batches we need 1xW transpose, too.
-        if(_is_batched_fc_layer)
-        {
-            const float shape_x = reshaped_weights_shape.x();
-            reshaped_weights_shape.set(0, reshaped_weights_shape.y() * interleave_width);
-            reshaped_weights_shape.set(1, static_cast<unsigned int>(std::ceil(shape_x / interleave_width)));
-        }
-
-        _reshape_weights_output.allocator()->init(TensorInfo(reshaped_weights_shape, 1, data_type, fixed_point_position));
+        _reshape_weights_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_fully_connected_reshaped_weights_shape(weights->info(),
+                                                  transpose_weights,
+                                                  _is_batched_fc_layer, interleave_width)));
 
         // Reshape the weights
         _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
     }
 
-    // Check correct shape of weights
-    if(_is_batched_fc_layer)
-    {
-        // Transpose + Transpose1xW
-        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != linear_input_size * interleave_width);
-        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->info()->tensor_shape().x()) / interleave_width)));
-    }
-    else
-    {
-        // Transpose
-        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().x() != output->info()->tensor_shape().x());
-        ARM_COMPUTE_ERROR_ON(weights_to_use->info()->tensor_shape().y() != linear_input_size);
-    }
-
     const ITensor *multiply_input = input;
 
     if(_linearize_input)
     {
-        TensorShape shape_im2col(input->info()->tensor_shape());
-        shape_im2col.collapse(num_input_dimensions);
-        _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, data_type, fixed_point_position));
+        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input->info(), num_input_dimensions)));
 
         // Configure im2col kernel
         _memory_group.manage(&_im2col_output);
-        _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+        _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true);
 
         multiply_input = &_im2col_output;
     }
 
+    int m = multiply_input->info()->dimension(1);
+    int k = multiply_input->info()->dimension(0);
+
     if(_is_batched_fc_layer)
     {
-        TensorShape shape_interleaved(multiply_input->info()->tensor_shape());
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        _interleave4x4_output.allocator()->init(TensorInfo(shape_interleaved, 1, data_type, fixed_point_position));
+        _interleave4x4_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_interleaved_shape(*multiply_input->info())));
 
         // Configure interleave4x4 kernel
         _memory_group.manage(&_interleave4x4_output);
@@ -217,13 +211,10 @@
     }
 
     // Configure matrix multiply kernel
-    _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f);
+    _mm_kernel.configure(multiply_input, weights_to_use, output, 1.0f, _is_batched_fc_layer, GEMMReshapeInfo(m, 0 /* no transpose */, k));
 
     if(_accumulate_biases)
     {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->tensor_shape().x() != output->info()->tensor_shape().x());
-
         // Configure accumulate biases kernel
         _accumulate_biases_kernel.configure(output, biases);
     }
@@ -246,6 +237,88 @@
     }
 }
 
+Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
+{ // Checks the tensor infos and validates, in order, each kernel this layer may need; ends with an empty Status when nothing failed
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, weights, output);
+    // Every output dimension above the first counts as a batch dimension; the remaining input dimensions describe one sample
+    const int    num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
+    const int    num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
+    const size_t linear_input_size    = input->tensor_shape().total_size_lower(num_input_dimensions);
+    // The input needs the im2col step below when its X dimension does not already equal linear_input_size
+    const bool linearize_input     = (input->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
+    const bool accumulate_biases   = biases != nullptr;
+    const bool is_batched_fc_layer = num_batch_dimensions > 0;
+    // The upper-dimension totals of input and output must agree, and the weights may have at most two dimensions
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size_upper(num_input_dimensions) != output->tensor_shape().total_size_upper(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+    const size_t                 interleave_width       = 16 / input->element_size();
+    const ITensorInfo           *weights_to_use         = weights;
+    std::unique_ptr<ITensorInfo> reshape_weights_output = input->clone();
+
+    if(!are_weights_reshaped && (transpose_weights || is_batched_fc_layer))
+    {
+        reshape_weights_output->set_tensor_shape(compute_fully_connected_reshaped_weights_shape(weights, transpose_weights, is_batched_fc_layer, interleave_width));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayerReshapeWeights::validate(weights, reshape_weights_output.get(), transpose_weights, is_batched_fc_layer));
+
+        weights_to_use = reshape_weights_output.get();
+    }
+
+    // Check the shape of the weights that take part in the multiplication (reshaped here or supplied by the caller)
+    if(is_batched_fc_layer)
+    {
+        // Transpose + Transpose1xW
+        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().x() != linear_input_size * interleave_width);
+        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().y() != static_cast<unsigned int>(std::ceil(static_cast<float>(output->tensor_shape().x()) / interleave_width)));
+    }
+    else
+    {
+        // Transpose
+        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().x() != output->tensor_shape().x());
+        ARM_COMPUTE_RETURN_ERROR_ON(weights_to_use->tensor_shape().y() != linear_input_size);
+    }
+
+    const ITensorInfo           *multiply_input       = input;
+    std::unique_ptr<ITensorInfo> im2col_output        = input->clone();
+    std::unique_ptr<ITensorInfo> interleave4x4_output = input->clone();
+
+    if(linearize_input)
+    {
+        im2col_output->set_tensor_shape(compute_im2col_shape(input, num_input_dimensions));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
+
+        multiply_input = im2col_output.get();
+    }
+    // m and k are read before the optional interleave step and are passed to the matrix multiply validation through GEMMReshapeInfo
+    int m = multiply_input->dimension(1);
+    int k = multiply_input->dimension(0);
+
+    if(is_batched_fc_layer)
+    {
+        interleave4x4_output->set_tensor_shape(compute_interleaved_shape(*multiply_input));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(multiply_input, interleave4x4_output.get()));
+
+        multiply_input = interleave4x4_output.get();
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(multiply_input, weights_to_use, output, 1.0f, is_batched_fc_layer, GEMMReshapeInfo(m, 0 /* no transpose */, k)));
+
+    if(accumulate_biases)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().x() != output->tensor_shape().x());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases));
+    }
+
+    return Status{};
+}
+
 void NEFullyConnectedLayer::run()
 {
     // Reshape of the weights (happens only once)
@@ -280,4 +353,3 @@
 
     _memory_group.release();
 }
-} // namespace arm_compute

diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index e640b06..05907ba 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
@@ -38,10 +39,16 @@
 
 namespace arm_compute
 {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wswitch-default"
+#pragma GCC diagnostic ignored "-Weffc++"
 #include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
+#pragma GCC diagnostic pop
 } // namespace arm_compute
 
 #include <cmath>
@@ -80,8 +87,41 @@
     // If so, all the kernels for reshaping the tensors can be skipped
     if(_run_vector_matrix_multiplication)
     {
-        // Configure the matrix multiply kernel
-        _mm_kernel.configure(a, b, d, alpha);
+#if defined(__aarch64__)
+        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+        {
+            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
+        }
+
+        if(_mm_optimised_kernel != nullptr)
+        {
+            struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+            const int N = d->info()->tensor_shape().x();
+            const int K = a->info()->tensor_shape().x();
+
+            size_t workbench_size = 0;
+
+            if(a->info()->data_type() == DataType::F32)
+            {
+                workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
+            }
+
+            constexpr size_t alignment = 4096;
+            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
+            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
+            _memory_group.manage(&_workspace);
+
+            // Configure matrix multiplication kernel
+            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
+            _workspace.allocator()->allocate();
+        }
+        else
+#endif /* defined(__aarch64__) */
+        {
+            // Configure the matrix multiply kernel
+            _mm_kernel.configure(a, b, d, alpha, false);
+        }
 
         // Configure matrix addition kernel
         if(beta != 0 && c != nullptr)
@@ -172,6 +212,10 @@
             _memory_group.manage(&_tmp_a);
             _memory_group.manage(&_tmp_b);
 
+            int m = a->info()->dimension(1);
+            int n = b->info()->dimension(0);
+            int k = a->info()->dimension(0);
+
             // Configure interleave kernel
             _interleave_kernel.configure(a, &_tmp_a);
 
@@ -179,7 +223,7 @@
             _transpose_kernel.configure(b, &_tmp_b);
 
             // Configure matrix multiplication kernel
-            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
+            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha, true, GEMMReshapeInfo(m, n, k));
 
             // Allocate once the all configure methods have been called
             _tmp_a.allocator()->allocate();

diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
new file mode 100644
index 0000000..a85078c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp

@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
+
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
+} // namespace arm_compute
+
+#include <cmath>
+#include <tuple>
+
+namespace
+{
+arm_compute::TensorShape get_reshaped_weights_shape(const arm_compute::ITensorInfo *weights, bool append_bias)
+{
+    // 2D shape of the reshaped weights: X is dimension 3 of the weights, Y is the product of dimensions 0-2 plus one row when the bias is appended
+    const unsigned int num_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+    return arm_compute::TensorShape(static_cast<unsigned int>(weights->dimension(3)), num_rows);
+}
+} // namespace
+
+namespace arm_compute
+{
+NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{ // The memory manager is moved into the memory group; the 1xW transposition flag stays false until configure() sets it
+}
+
+void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
+{
+    // Set up the weights reshape kernel and, when transpose1xW is true, the 1xW transpose kernel. Arguments are validated first.
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
+                                                                          (biases != nullptr) ? biases->info() : nullptr,
+                                                                          output->info(),
+                                                                          transpose1xW));
+
+    // Biases, when present, are embedded into the weights matrix; this is never done for asymmetric quantized weights
+    const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
+    // Both branches below hand this same tensor to the reshape kernel so that it always agrees with the shape derived from append_biases
+    const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
+
+    _transpose1xW = transpose1xW;
+
+    if(transpose1xW)
+    {
+        // Intermediate tensor that holds the reshaped weights before the 1xW transposition
+        TensorInfo info_wr = weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(get_reshaped_weights_shape(weights->info(), append_biases));
+
+        _weights_reshaped.allocator()->init(info_wr);
+        _memory_group.manage(&_weights_reshaped);
+
+        _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
+        _weights_transposed_kernel.configure(&_weights_reshaped, output);
+
+        _weights_reshaped.allocator()->allocate();
+    }
+    else
+    {
+        _weights_reshape_kernel.configure(weights, biases_to_use, output);
+    }
+    // The output carries the same quantization info as the weights
+    output->info()->set_quantization_info(weights->info()->quantization_info());
+}
+
+Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW)
+{
+    // Static validation of the weights reshape step; an empty Status is returned when no check fails.
+    //  - weights:      tensor info with at most 4 dimensions and one of the data types listed below
+    //  - biases:       optional; must be 1D with dimension 0 equal to dimension 3 of the weights, not accepted for asymmetric quantized weights
+    //  - output:       tensor info of the reshaped (and optionally 1xW transposed) weights
+    //  - transpose1xW: when true the 1xW transposition of the reshaped matrix is validated as well
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+    if(!is_data_type_quantized_asymmetric(weights->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
+    }
+
+    // Check if bias are present, if yes they will be embedded to the weights matrix
+    const bool append_bias = (biases != nullptr);
+
+    if(append_bias)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    // Validate the kernels used for the requested layout: reshape followed by 1xW transpose, or reshape only
+    if(transpose1xW)
+    {
+        TensorInfo weights_reshaped = weights->clone()->set_tensor_shape(get_reshaped_weights_shape(weights, append_bias));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(&weights_reshaped, output));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
+    }
+
+    return Status{};
+}
+
+void NEConvolutionLayerReshapeWeights::run()
+{ // Schedules the kernels set up by configure() between acquiring and releasing the memory group
+    _memory_group.acquire();
+    // The reshape kernel is always scheduled; the second argument (3) is the dimension index passed to the scheduler
+    NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+    // The 1xW transpose kernel is scheduled only when configure() was called with transpose1xW == true
+    if(_transpose1xW)
+    {
+        NEScheduler::get().schedule(&_weights_transposed_kernel, Window::DimY);
+    }
+
+    _memory_group.release();
+}
+
+namespace
+{
+TensorShape get_reshaped_weights_shape_conv(const ITensorInfo *weights, bool append_bias, bool is_fully_connected_convolution)
+{
+    // Shape of the weights after reshaping: the plain 2D matrix for a "fully connected" convolution, its 1xW transposed shape otherwise
+    const unsigned int matrix_cols = weights->dimension(3);
+    const unsigned int matrix_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+
+    if(is_fully_connected_convolution)
+    {
+        return TensorShape(matrix_cols, matrix_rows);
+    }
+
+    // 16 divided by the element size; kept as float so that the column count is divided without truncation before std::ceil
+    const float        block_width       = 16.0f / weights->element_size();
+    const unsigned int transposed_width  = matrix_rows * static_cast<unsigned int>(block_width);
+    const unsigned int transposed_height = static_cast<unsigned int>(std::ceil(matrix_cols / block_width));
+    return TensorShape(transposed_width, transposed_height);
+}
+
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
+                                      bool &append_bias,
+                                      bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
+                                      bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized,
+                                      unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
+                                      unsigned int &conv_w, unsigned int &conv_h)
+{ // Validates input/weights/biases and fills every reference parameter with the value derived from them; shared by configure() and validate()
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
+
+    dt           = input->data_type();
+    is_quantized = is_data_type_quantized_asymmetric(dt);
+    // Biases must be S32 for asymmetric quantized input and otherwise share the data type of the input
+    if(biases != nullptr)
+    {
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+    // Biases are appended to the matrices only for non-quantized data; the kernel size comes from weights_info when the weights are already reshaped
+    append_bias          = (biases != nullptr) && (!is_quantized);
+    are_weights_reshaped = weights_info.are_reshaped();
+    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
+    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+    mat_weights_cols     = weights->dimension(3);
+    mat_weights_rows     = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+    // conv_w and conv_h are whatever scaled_dimensions() returns for the input width/height, the kernel size and conv_info
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    // A "fully connected" convolution has a 1x1 output; interleaving is used only when the case is neither that nor quantized
+    is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+    is_interleaved                 = (!is_fully_connected_convolution && !is_quantized);
+
+    return Status{};
+}
+} // namespace
+
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
+    : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager),
+      _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
+      _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
+{ // _memory_group and _mm_gemmlowp both receive the given memory manager; no optimised kernel is selected and all flags are false until configure()
+}
+
+void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+{ // Configures the matrix multiplication: _mm_gemmlowp when _is_quantized is set, _mm_kernel otherwise
+    if(_is_quantized)
+    {
+        // Computing the convolution needs negative offsets, so the QuantizationInfo of input and weights
+        // is saved here and replaced by a copy with the offset negated while _mm_gemmlowp is configured
+        const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+        // Restore the original QuantizationInfo, as input and weights could be used in other convolution layers
+        input->info()->set_quantization_info(input_quantization_info);
+        weights->info()->set_quantization_info(weights_quantization_info);
+    }
+    else
+    {
+        _mm_kernel.configure(input, weights, output, 1.f, is_interleaved, reshape_info);
+    }
+}
+
+void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
+{ // Initialises _workspace for the interleaved assembly GEMM and hands it to the memory group; does nothing outside ARM builds
+    ARM_COMPUTE_UNUSED(ci);
+    ARM_COMPUTE_UNUSED(M);
+    ARM_COMPUTE_UNUSED(N);
+    ARM_COMPUTE_UNUSED(K);
+#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__)
+    GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+    GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) / defined(__aarch64__) kernel selection */
+    // The workspace is (working size + alignment - 1) U8 elements per scheduler thread - presumably so each slice can be aligned; confirm
+    constexpr size_t alignment = 4096;
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+    _memory_group.manage(&_workspace);
+#endif /* defined(__arm__) || defined(__aarch64__) */
+}
+
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+    // Null-check the mandatory tensors, then validate the arguments and derive the flags and matrix sizes used below
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    DataType     dt{};
+    unsigned int kernel_width     = 0;
+    unsigned int kernel_height    = 0;
+    unsigned int mat_weights_cols = 0;
+    unsigned int mat_weights_rows = 0;
+    unsigned int conv_w           = 0;
+    unsigned int conv_h           = 0;
+
+    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _append_bias, _are_weights_reshaped,
+                                                   kernel_width, kernel_height,
+                                                   _is_fully_connected_convolution, _is_interleaved, _is_quantized,
+                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h);
+
+    ARM_COMPUTE_ERROR_THROW_ON(status);
+
+    const unsigned int fixed_point_position = input->info()->fixed_point_position();
+    const ITensor     *biases_to_use        = (_append_bias) ? biases : nullptr;
+    // Select an assembly-optimised kernel for F32 data: on 32-bit ARM builds for ARMV7 targets, on 64-bit builds for ARMV8 and above
+#if defined(__arm__)
+    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    {
+        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+    }
+#elif defined(__aarch64__)
+    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+    {
+        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+    }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+    // Reshape weights if needed
+    if(_mm_optimised_kernel != nullptr)
+    {
+        if(_are_weights_reshaped)
+        {
+            mat_weights_cols = weights_info.num_kernels();
+            mat_weights_rows = weights->info()->dimension(1);
+        }
+        else
+        {
+            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+            // Create tensor to store the reshaped weights
+            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+            weights = &_weights_reshaped;
+        }
+    }
+    else
+    {
+        if(_are_weights_reshaped)
+        {
+            if(_is_fully_connected_convolution || _is_quantized)
+            {
+                mat_weights_cols = weights_info.num_kernels();
+                mat_weights_rows = weights->info()->dimension(1);
+            }
+            else
+            {
+                mat_weights_cols = weights_info.num_kernels();
+                mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(2) + (_append_bias ? 1 : 0);
+            }
+        }
+        else
+        {
+            TensorShape reshaped_weights_shape;
+
+            if(_is_fully_connected_convolution || _is_quantized)
+            {
+                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+            }
+            else
+            {
+                // Create tensor to store transposed weights
+                const float transpose_width = 16.0f / input->info()->element_size();
+                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+            }
+
+            // Create tensor to store the reshaped weights
+            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, _is_interleaved /* 1xW transpose */);
+            weights = &_weights_reshaped;
+        }
+    }
+
+    // Create tensor to store im2col reshaped inputs
+    const unsigned int mat_input_cols = mat_weights_rows;
+    const unsigned int mat_input_rows = conv_w * conv_h;
+
+    TensorShape shape_im2col(input->info()->tensor_shape());
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+    _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    _memory_group.manage(&_input_im2col_reshaped);
+
+    // Create tensor (interleave) to prepare input tensor for GEMM
+    if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
+    {
+        TensorShape shape_interleaved(shape_im2col);
+        shape_interleaved.set(0, shape_interleaved.x() * 4);
+        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+        _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
+        _memory_group.manage(&_input_interleaved_reshaped);
+    }
+
+    // Create GEMM output tensor
+    TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+    info_gemm.set_quantization_info(output->info()->quantization_info());
+    _gemm_output.allocator()->init(info_gemm);
+    _memory_group.manage(&_gemm_output);
+
+    // Configure kernels
+    // Configure im2col
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
+
+    // Configure matrix multiply
+    if(_mm_optimised_kernel != nullptr)
+    {
+        struct CPUInfo ci = NEScheduler::get().cpu_info();
+
+        const int M = _gemm_output.info()->tensor_shape().y();
+        const int N = _gemm_output.info()->tensor_shape().x();
+        const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+
+#if defined(__aarch64__)
+        if((N <= 128) && (K <= 128))
+        {
+            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
+        }
+        else
+#endif /* defined(__aarch64__) */
+        {
+            configure_asm_mm(ci, M, N, K);
+        }
+
+        // Configure matrix multiplication kernel
+        _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
+        // NOTE(review): on the AArch64 native-kernel branch above configure_asm_mm() is skipped, so _workspace is allocated without being initialised here - confirm this is intended
+        _workspace.allocator()->allocate();
+    }
+    else
+    {
+        if(_is_interleaved)
+        {
+            // Configure GEMMInterleave4x4. _input_interleaved_reshaped will be auto configured in the kernel
+            _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+
+            // Configure GEMM
+            configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(1), 0 /* no transpose */,
+                                                                                                                _input_im2col_reshaped.info()->dimension(0)));
+            _input_interleaved_reshaped.allocator()->allocate();
+        }
+        else
+        {
+            configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, _is_interleaved);
+        }
+    }
+
+    _input_im2col_reshaped.allocator()->allocate();
+
+    // Configure output stage for quantized case
+    if(_is_quantized)
+    {
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _memory_group.manage(&_tmp_output);
+        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+    }
+
+    // Configure Col2Im
+    _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+    if(_is_quantized)
+    {
+        _tmp_output.allocator()->allocate();
+    }
+    _gemm_output.allocator()->allocate();
+
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+    // Allocate the reshaped weights tensor when the weights were reshaped by this function
+    if(!_are_weights_reshaped)
+    {
+        _weights_reshaped.allocator()->allocate();
+    }
+}
+
+Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                        const WeightsInfo &weights_info) // Validates the weights reshape, im2col, optional interleave and matrix-multiply steps in turn
+{
+    ARM_COMPUTE_UNUSED(output); // NOTE(review): output is not checked anywhere in this function
+
+    DataType     dt{}; // dt and the locals below are handed to validate_and_initialize_values() and read afterwards
+    bool         append_bias{};
+    bool         are_weights_reshaped{};
+    bool         is_fully_connected_convolution{};
+    bool         is_interleaved{};
+    bool         is_quantized{};
+    unsigned int kernel_width     = 0;
+    unsigned int kernel_height    = 0;
+    unsigned int mat_weights_cols = 0;
+    unsigned int mat_weights_rows = 0;
+    unsigned int conv_w           = 0;
+    unsigned int conv_h           = 0;
+
+    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, append_bias, are_weights_reshaped, kernel_width, kernel_height,
+                                                   is_fully_connected_convolution, is_interleaved, is_quantized, mat_weights_cols, mat_weights_rows,
+                                                   conv_w, conv_h);
+
+    const Size2D kernel_weights = Size2D(kernel_width, kernel_height); // consumed by the im2col validation further down
+
+    ARM_COMPUTE_RETURN_ON_ERROR(status);
+
+    std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone(); // private copy: the reshaped shape is set on it, not on the caller's info
+    bool                         optimised_kernel = false;
+
+#if defined(__arm__)
+    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    {
+        optimised_kernel = true;
+    }
+#elif defined(__aarch64__)
+    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+    {
+        optimised_kernel = true;
+    }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+    // Derive the weight matrix size for pre-reshaped weights, otherwise validate the weights reshape
+    if(optimised_kernel)
+    {
+        if(are_weights_reshaped)
+        {
+            mat_weights_cols = weights_info.num_kernels();
+            mat_weights_rows = weights->dimension(1);
+        }
+        else
+        {
+            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows }; // NOTE(review): not read again in this function
+
+            // Give the cloned info the reshaped shape, validate the reshape, then use it as the weights from here on
+            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+            weights = reshaped_weights.get();
+        }
+    }
+    else
+    {
+        if(are_weights_reshaped)
+        {
+            const unsigned int transpose_width = 16 / input->element_size();
+            mat_weights_cols                   = weights_info.num_kernels();
+            mat_weights_rows                   = weights->dimension(0) / transpose_width + (append_bias ? 1 : 0);
+        }
+        else
+        {
+            TensorShape reshaped_weights_shape; // NOTE(review): assigned below but not read again in this function
+
+            if(is_fully_connected_convolution || is_quantized)
+            {
+                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+            }
+            else
+            {
+                // Shape the weights would have once transposed (16 bytes' worth of elements per group)
+                const float transpose_width = 16.0f / input->element_size();
+                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+            }
+
+            // Give the cloned info the reshaped shape, validate the reshape, then use it as the weights from here on
+            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+            weights = reshaped_weights.get();
+        }
+    }
+
+    // Validate im2col: its output is described as [mat_weights_rows, conv_w * conv_h] with dimension 2 set to 1
+    const unsigned int mat_input_cols = mat_weights_rows;
+    const unsigned int mat_input_rows = conv_w * conv_h;
+    TensorShape        shape_im2col   = input->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+    TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false));
+
+    // Describe the GEMM output as [mat_weights_cols, conv_w * conv_h]; only a TensorInfo is built here
+    TensorShape shape_gemm(im2_col_info.tensor_shape());
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+    TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
+
+    // Validate the matrix multiply, preceded by the 4x4 interleave when is_interleaved is set
+    if(is_interleaved)
+    {
+        TensorShape shape_interleaved = shape_im2col;
+        shape_interleaved.set(0, shape_interleaved.x() * 4);
+        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+        TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+    }
+
+    return Status{};
+}
+
+void NEGEMMConvolutionLayer::run()
+{
+    // The weights only need reshaping on the first run after a configure
+    if(!_are_weights_reshaped)
+    {
+        _are_weights_reshaped = true;
+        _reshape_weights.run();
+    }
+
+    _memory_group.acquire();
+
+    // Reshape the input (im2col)
+    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+
+    // Matrix multiply: the optimised kernel when one is present, the generic path otherwise
+    if(_mm_optimised_kernel == nullptr)
+    {
+        // Generic path: optional interleave, then the quantized or the plain multiply
+        if(_is_interleaved)
+        {
+            NEScheduler::get().schedule(&_input_interleave_kernel, Window::DimY);
+        }
+
+        if(!_is_quantized)
+        {
+            NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
+        }
+        else
+        {
+            _mm_gemmlowp.run();
+        }
+    }
+    else
+    {
+        // Optimised path: one dedicated kernel performs the multiply
+        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+    }
+
+    // Quantized case only: run the output stage on the GEMM result
+    if(_is_quantized)
+    {
+        _gemmlowp_output_stage.run();
+    }
+
+    // Reshape the result matrix into the output tensor (col2im)
+    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+
+    _memory_group.release();
+}
+} // namespace arm_compute

diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index c4028dc..ad47593 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
@@ -45,6 +46,7 @@
 } // namespace arm_compute
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
@@ -102,17 +104,9 @@
         else
         {
             // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorShape shape_tmp_a = a->info()->tensor_shape();
-            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
+            TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
             // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorShape shape_tmp_b = b->info()->tensor_shape();
-            shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
-            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+            TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
             _memory_group.manage(&_tmp_a);
@@ -144,12 +138,8 @@
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
     if(_a_offset != 0)
     {
-        TensorShape shape_vector_sum_col = b->info()->tensor_shape();
-        if(b->info()->num_dimensions() > 1)
-        {
-            shape_vector_sum_col.remove_dimension(1);
-        }
-        TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
+
         _vector_sum_col.allocator()->init(info_vector_sum_col);
         _memory_group.manage(&_vector_sum_col);
 
@@ -160,13 +150,8 @@
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
     {
-        TensorShape shape_vector_sum_row = a->info()->tensor_shape();
-        shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
-        if(a->info()->num_dimensions() > 1)
-        {
-            shape_vector_sum_row.remove_dimension(1);
-        }
-        TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
+
         _vector_sum_row.allocator()->init(info_vector_sum_row);
         _memory_group.manage(&_vector_sum_row);
 
@@ -261,9 +246,7 @@
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
     {
-        TensorShape shape_vector_sum_col = b->tensor_shape();
-        shape_vector_sum_col.remove_dimension(1);
-        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
+        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
 
         // Configure Matrix B reduction kernel
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
@@ -272,10 +255,7 @@
     // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
     if(b_offset != 0)
     {
-        TensorShape shape_vector_sum_row = a->tensor_shape();
-        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
-        shape_vector_sum_row.remove_dimension(1);
-        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
+        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
         // Configure matrix A reduction kernel
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));

diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index 571bf2b..802b946 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,3 +38,7 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+Status NEGEMMTranspose1xW::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEGEMMTranspose1xWKernel::validate(input, output); // pure forwarder: the kernel's own validate() supplies the Status
+}

diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 8e90e66..b962db9 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,14 +28,14 @@
 
 using namespace arm_compute;
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, kernel_dims, conv_info, has_bias);
+    k->configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected); // every argument, including the new flag, is passed to the kernel unchanged
     _kernel = std::move(k);
 }
 
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias);
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected); // pure forwarder: the kernel's validate() supplies the Status
 }

diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 0893701..9ad9689 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,7 +42,7 @@
 {
 }
 
-void NELaplacianReconstruct::configure(const IPyramid *pyramid, const ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
+void NELaplacianReconstruct::configure(const IPyramid *pyramid, ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == pyramid);
     ARM_COMPUTE_ERROR_ON(input == output);

diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index b29b796..45ddb70 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
 
     if(biases != nullptr)
     {

diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
new file mode 100644
index 0000000..92abd03
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPermute.cpp

@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+
+#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
+{
+    auto permute_kernel = arm_compute::support::cpp14::make_unique<NEPermuteKernel>();
+    permute_kernel->configure(input, output, perm); // configured first, then handed over to _kernel
+    _kernel = std::move(permute_kernel);
+}
+
+Status NEPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    return NEPermuteKernel::validate(input, output, perm); // pure forwarder: the kernel's own validate() supplies the Status
+}

diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 8a32507..bc0b6f8 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp

@@ -38,7 +38,7 @@
 void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
 {
     // Check if we have Global Pooling Layer
-    _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size()) && (input->info()->dimension(1) == pool_info.pool_size());
+    _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size().width) && (input->info()->dimension(1) == pool_info.pool_size().height);
 
     // Configure pooling kernel
     _pooling_layer_kernel.configure(input, output, pool_info);

diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 8e6773c..4fb8300 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 using namespace arm_compute;
 
 NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp() // _softmax_kernel takes the place of the former shift-exp-sum and norm kernels; _sum is gone
 {
 }
 
@@ -40,31 +40,22 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Create intermediate tensors shapes
-    TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
-    _tmp.allocator()->init(tensor_info_tmp);
-
-    TensorShape shape = input->info()->tensor_shape();
-    shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
-    _max.allocator()->init(tensor_info_max_sum);
-    _sum.allocator()->init(tensor_info_max_sum);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp);
-    _memory_group.manage(&_max);
-    _memory_group.manage(&_sum);
-
     // Configure Kernels
     _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
-    _norm_kernel.configure(&_tmp, &_sum, output);
     _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
+    _softmax_kernel.configure(input, &_max, output, beta, &_tmp);
+
+    // Init intermediate tensors
+    _max.allocator()->init(*_max.info());
+    _tmp.allocator()->init(*_tmp.info());
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_max);
+    _memory_group.manage(&_tmp);
 
     // Allocate intermediate tensors
-    _tmp.allocator()->allocate();
     _max.allocator()->allocate();
-    _sum.allocator()->allocate();
+    _tmp.allocator()->allocate();
 }
 
 Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
@@ -72,14 +63,12 @@
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape max_sum_shape = input->tensor_shape();
-    max_sum_shape.set(0, 1);
-
-    TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape));
+    const TensorShape max_shape           = TensorShape(input->tensor_shape()).set(0, 1);
+    const TensorInfo  tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
+    const TensorInfo  dont_care;
 
     ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta));
-    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care));
 
     return Status{};
 }
@@ -90,8 +79,7 @@
 
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
     NEScheduler::get().schedule(&_max_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
 
     _memory_group.release();
 }

diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index da46f87..0ac6d09 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp

@@ -23,11 +23,16 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
+#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+
 namespace
 {
 inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
@@ -42,29 +47,83 @@
 
 namespace arm_compute
 {
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
+
+    ARM_COMPUTE_UNUSED(output);
+
+    return Status{};
+}
+} //namespace
+
 NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(),
-      _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+    : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _permute_input(),
+      _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
+      _reshaped_kernel(false) // the four kernel pointers above stay null until configure() moves the created kernels into them
 {
 } /* arm_compute */
 
 void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(1) != 3 || weights->info()->dimension(0) != 3, "Only 3x3 kernels are supported");
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info));
 
     _weights = weights;
     _input   = input;
     _output  = output;
 
+    std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
+    std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
+    std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
+    std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
+
+    switch(weights->info()->dimension(0))
+    {
+        case 3:
+        {
+            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
+            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
+            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
+            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+            break;
+        }
+        case 5:
+        {
+            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
+            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
+            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
+            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+        }
+    }
+
+    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+    const bool        use_same_padding = use_padding_type == PADDING_SAME;
+
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
@@ -76,24 +135,20 @@
     const int out_channels = output->info()->dimension(2);
 
     const Tensor4DShape in_shape(internal_get_input_shape(input));
-
+    const size_t        data_type_size = input->info()->element_size();
     // Get the memory required to instantiate a new Winograd operator.
     constexpr size_t storage_alignment   = 64;
-    const size_t     kernel_storage_size = NEWinogradLayerKernel::get_weight_storage_size(out_channels, in_channels) * sizeof(float);
+    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
     _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _memory_group.manage(&_kernel_storage);
-    _memory_group.manage(&_input_nhwc);
     _kernel_storage.allocator()->allocate();
     // Input storage
-    const size_t input_storage_size = NEWinogradLayerKernel::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, false) * sizeof(float);
+    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
     _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _memory_group.manage(&_input_workspace);
     _input_workspace.allocator()->allocate();
 
     // Output storage
-    const size_t output_storage_size = NEWinogradLayerKernel::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, false) * sizeof(float);
+    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
     _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _memory_group.manage(&_output_workspace);
     _output_workspace.allocator()->allocate();
 
     // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
@@ -101,57 +156,67 @@
                                 _output->info()->dimension(1), _output->info()->dimension(3)),
                     1, _output->info()->data_type());
     _output_nhwc.allocator()->init(info);
-
     _output_nhwc.allocator()->allocate();
 
     // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-    switch(weights->info()->num_dimensions())
-    {
-        case 3:
-        {
-            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
-            break;
-        }
-        case 4:
-        {
-            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
-        }
-    }
-
+    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
     _weights_hwio.allocator()->allocate();
 
     // configure the kernel to transform the input tensor from NCHW -> NHWC
     _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-
     _input_nhwc.allocator()->allocate();
 
-    // Create Winograd operator object
-    _conv = support::cpp14::make_unique<Winograd3x3F32>(
-                in_shape.n_batches,
-                in_shape.n_channels,
-                in_shape.n_rows,
-                in_shape.n_cols,
-                out_channels,
-                false,
-                reinterpret_cast<const float *>(_weights_hwio.buffer()),
-                reinterpret_cast<float *>(_kernel_storage.buffer()),
-                reinterpret_cast<float *>(_input_nhwc.buffer()),
-                reinterpret_cast<float *>(_input_workspace.buffer()),
-                reinterpret_cast<float *>(_output_nhwc.buffer()),
-                reinterpret_cast<float *>(_output_workspace.buffer()));
+    const int         weights_width  = weights->info()->dimension(0);
+    const int         weights_height = weights->info()->dimension(1);
+    const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
 
-    // Configure the kernel, padding not needed so it's safe to call configure after allocare
-    _winograd_kernel.configure(_conv.get());
+    // Configure the InputTransform
+    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+
+    // Configure WeightsTransform
+    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+
+    // Configure OutputTransform
+    //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
+                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
+                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+
+    // Configure Batched GEMMs
+    const int      output_tile_rows         = batched_gemm_kernel->get_output_tile_rows();
+    const int      output_tile_cols         = batched_gemm_kernel->get_output_tile_cols();
+    const int      n_block                  = batched_gemm_kernel->get_number_blocks();
+    const int      tile_rows                = iceildiv(output_shape.n_rows, output_tile_rows);
+    const int      tile_cols                = iceildiv(output_shape.n_cols, output_tile_cols);
+    const int      m                        = in_shape.n_batches * tile_rows * tile_cols;
+    const int      k                        = in_shape.n_channels;
+    const int      n                        = out_channels;
+    const int      input_matrix_row_stride  = in_shape.n_channels;
+    const int      kernel_matrix_row_stride = roundup(out_channels, n_block);
+    const int      output_matrix_row_stride = kernel_matrix_row_stride;
+    const unsigned n_gemms                  = batched_gemm_kernel->get_number_gemms();
+
+    batched_gemm_kernel->configure(n_gemms, m, k, n,
+                                   input_matrix_stride, input_matrix_row_stride,
+                                   kernel_matrix_stride, kernel_matrix_row_stride,
+                                   output_matrix_stride, output_matrix_row_stride,
+                                   reinterpret_cast<float *>(_input_workspace.buffer()),
+                                   reinterpret_cast<float *>(_kernel_storage.buffer()),
+                                   reinterpret_cast<float *>(_output_workspace.buffer()));
 
     // Reorder the convoluted output to ACL's ordering NCHW
     _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
 
+    _transform_input_kernel   = std::move(transform_input_kernel);
+    _transform_weights_kernel = std::move(transform_weights_kernel);
+    _transform_output_kernel  = std::move(transform_output_kernel);
+    _batched_gemm_kernel      = std::move(batched_gemm_kernel);
 }
 
 void NEWinogradLayer::run()
@@ -161,18 +226,31 @@
     {
         _reshaped_kernel = true;
         _permute_weights.run();
-        _conv->transform_weights();
+        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
     }
     //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
     _permute_input.run();
+
     // Transform input tensor to the winograd domain
-    _conv->transform_input();
+    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
+
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    NEScheduler::get().schedule(&_winograd_kernel, Window::DimX);
+    NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
+
     // Transform output tensor to the spatial domain
-    _conv->transform_output();
+    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
+
     // Reorder the convoluted output to ACL's ordering NCHW
     _permute_output.run();
     _memory_group.release();
 }
+
+Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, biases, output); // Report a null argument through the returned Status instead of asserting
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info)); // Forward any failure Status produced by validate_arguments
+
+    return Status{};
+}
+
 } // namespace arm_compute

diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 505c4a3..8925acf 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -142,8 +142,8 @@
 
 std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
 
-void Scheduler::set(std::shared_ptr<IScheduler> &scheduler)
+void Scheduler::set(std::shared_ptr<IScheduler> scheduler)
 {
-    _custom_scheduler = scheduler;
+    _custom_scheduler = std::move(scheduler);
     set(Type::CUSTOM);
 }
commit	06ea048f062a50404b1b3998a61a45449c2d1f0f	[log] [tgz]
author	Anthony Barbier <anthony.barbier@arm.com>	Thu Feb 22 15:45:35 2018 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	Fri Feb 23 11:49:54 2018 +0000
tree	aa0dea3b0c49422538df9a5a02578b2c29e6fa67
parent	292227986edb37b01061afcad6df18ba9d6ccbeb [diff]