arm_compute v18.05
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 54e3e52..cda29d6 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,47 +27,8 @@
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/Types.h"
 
-#include <map>
-#include <regex>
 #include <vector>
 
-namespace
-{
-arm_compute::GPUTarget get_bifrost_target(const std::string &version)
-{
-    if(version == "70")
-    {
-        return arm_compute::GPUTarget::G70;
-    }
-    else
-    {
-        return arm_compute::GPUTarget::BIFROST;
-    }
-}
-
-arm_compute::GPUTarget get_midgard_target(const std::string &version)
-{
-    switch(version[0])
-    {
-        case '6':
-            return arm_compute::GPUTarget::T600;
-        case '7':
-            return arm_compute::GPUTarget::T700;
-        case '8':
-            return arm_compute::GPUTarget::T800;
-        default:
-            return arm_compute::GPUTarget::MIDGARD;
-    }
-}
-
-bool extension_support(const cl::Device &device, const char *extension_name)
-{
-    std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
-    auto        pos        = extensions.find(extension_name);
-    return (pos != std::string::npos);
-}
-} // namespace
-
 namespace arm_compute
 {
 std::string get_cl_type_from_data_type(const DataType &dt)
@@ -150,94 +111,27 @@
     }
 }
 
-const std::string &string_from_target(GPUTarget target)
-{
-    static std::map<GPUTarget, const std::string> gpu_target_map =
-    {
-        { GPUTarget::MIDGARD, "midgard" },
-        { GPUTarget::BIFROST, "bifrost" },
-        { GPUTarget::T600, "t600" },
-        { GPUTarget::T700, "t700" },
-        { GPUTarget::T800, "t800" },
-        { GPUTarget::G70, "g70" }
-    };
-
-    return gpu_target_map[target];
-}
-
 GPUTarget get_target_from_device(cl::Device &device)
 {
-    size_t name_size = 0;
-
     // Query device name size
-    cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, 0, nullptr, &name_size);
-    ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (name_size == 0), "clGetDeviceInfo failed to return valid information");
-    ARM_COMPUTE_UNUSED(err);
+    std::string device_name = device.getInfo<CL_DEVICE_NAME>();
 
-    std::vector<char> name_buffer(name_size);
-
-    // Query device name
-    err = clGetDeviceInfo(device.get(), CL_DEVICE_NAME, name_size, name_buffer.data(), nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
-    ARM_COMPUTE_UNUSED(err);
-
-    std::regex  mali_regex(R"(Mali-([TG])(\d+))");
-    std::string device_name(name_buffer.begin(), name_buffer.end());
-    std::smatch name_parts;
-    const bool  found_mali = std::regex_search(device_name, name_parts, mali_regex);
-
-    if(!found_mali)
-    {
-        ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Mali GPU. Target is set to MIDGARD.");
-        return GPUTarget::MIDGARD;
-    }
-
-    const char         target  = name_parts.str(1)[0];
-    const std::string &version = name_parts.str(2);
-
-    switch(target)
-    {
-        case 'T':
-            return get_midgard_target(version);
-        case 'G':
-            return get_bifrost_target(version);
-        default:
-            ARM_COMPUTE_LOG_INFO_MSG_CORE("Mali GPU unknown. Target is set to the default one.");
-            return GPUTarget::MIDGARD;
-    }
+    return get_target_from_name(device_name);
 }
 
-GPUTarget get_arch_from_target(GPUTarget target)
+bool arm_non_uniform_workgroup_supported(const cl::Device &device)
 {
-    return (target & GPUTarget::GPU_ARCH_MASK);
+    return device_supports_extension(device, "cl_arm_non_uniform_work_group_size");
 }
 
-bool non_uniform_workgroup_support(const cl::Device &device)
+bool fp16_supported(const cl::Device &device)
 {
-    return extension_support(device, "cl_arm_non_uniform_work_group_size");
-}
-
-bool fp16_support(const cl::Device &device)
-{
-    return extension_support(device, "cl_khr_fp16");
+    return device_supports_extension(device, "cl_khr_fp16");
 }
 
 CLVersion get_cl_version(const cl::Device &device)
 {
-    std::vector<char> version;
-    size_t            version_size = 0;
-    cl_int            err          = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, 0, nullptr, &version_size);
-    ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (version_size == 0), "clGetDeviceInfo failed to return valid information");
-    ARM_COMPUTE_UNUSED(err);
-
-    // Resize vector
-    version.resize(version_size);
-    // Query version
-    err = clGetDeviceInfo(device.get(), CL_DEVICE_VERSION, version_size, version.data(), nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
-    ARM_COMPUTE_UNUSED(err);
-
-    std::string version_str(version.begin(), version.end());
+    std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
     if(version_str.find("OpenCL 2") != std::string::npos)
     {
         return CLVersion::CL20;
@@ -258,4 +152,11 @@
     return CLVersion::UNKNOWN;
 }
 
+bool device_supports_extension(const cl::Device &device, const char *extension_name)
+{
+    std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
+    auto        pos        = extensions.find(extension_name);
+    return (pos != std::string::npos);
+}
+
 } // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index c7c08d4..bdb26f8 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -151,7 +151,8 @@
     { "activation_layer_qa8", "activation_layer_qa8.cl" },
     { "arithmetic_add", "arithmetic_op.cl" },
     { "arithmetic_sub", "arithmetic_op.cl" },
-    { "batchnormalization_layer", "batchnormalization_layer.cl" },
+    { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
+    { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
     { "bitwise_or", "bitwise_op.cl" },
     { "bitwise_and", "bitwise_op.cl" },
     { "bitwise_xor", "bitwise_op.cl" },
@@ -161,6 +162,7 @@
     { "channel_combine_RGBA8888", "channel_combine.cl" },
     { "channel_combine_UYVY422", "channel_combine.cl" },
     { "channel_combine_YUYV422", "channel_combine.cl" },
+    { "channel_shuffle_nchw", "channel_shuffle.cl" },
     { "channel_extract_NV12", "channel_extract.cl" },
     { "channel_extract_NV21", "channel_extract.cl" },
     { "channel_extract_RGB888", "channel_extract.cl" },
@@ -170,8 +172,12 @@
     { "combine_gradients_L1", "canny.cl" },
     { "combine_gradients_L2", "canny.cl" },
     { "concatenate_depth", "concatenate.cl" },
+    { "concatenate_width", "concatenate.cl" },
     { "convolution_rectangle", "convolution_rectangle.cl" },
     { "col2im", "col2im.cl" },
+    { "convert_depth_down", "depth_convert.cl" },
+    { "convert_depth_up", "depth_convert.cl" },
+    { "convert_fc_weights", "convert_fc_weights.cl" },
     { "convolution3x3_static", "convolution3x3.cl" },
     { "convolution5x5_static", "convolution5x5.cl" },
     { "convolution7x7_static", "convolution7x7.cl" },
@@ -182,17 +188,20 @@
     { "convolution_separable7x1_static", "convolution7x7.cl" },
     { "convolution_separable1x9_static", "convolution9x9.cl" },
     { "convolution_separable9x1_static", "convolution9x9.cl" },
-    { "convert_depth_down", "depth_convert.cl" },
-    { "convert_depth_up", "depth_convert.cl" },
+    { "copy_tensor", "copy_tensor.cl" },
     { "copy_plane", "channel_extract.cl" },
     { "copy_planes_3p", "channel_combine.cl" },
     { "copy_to_keypoint", "fast_corners.cl" },
     { "deconvolution_upsample", "deconvolution_layer.cl" },
     { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
-    { "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" },
-    { "depthwise_convolution_3x3_stridex1_stridey1_bifrost", "depthwise_convolution.cl" },
-    { "depthwise_convolution_3x3_stridex2_stridey2_bifrost", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl" },
+    { "depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl" },
+    { "depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl" },
+    { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl" },
     { "depthwise_im2col", "depthwise_convolution.cl" },
     { "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
     { "depthwise_weights_reshape", "depthwise_convolution.cl" },
@@ -223,11 +232,13 @@
     { "gemm_mv", "gemv.cl" },
     { "gemm_mv_quantized", "gemv.cl" },
     { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
-    { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
+    { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
+    { "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_qs8", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_qs16", "gemm.cl" },
     { "gemm_mm_floating_point", "gemm.cl" },
+    { "gemm_mm_floating_point_f16_bifrost", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
     { "gemm_mm_qs8", "gemm.cl" },
@@ -306,8 +317,10 @@
     { "pooling_layer_3", "pooling_layer.cl" },
     { "pooling_layer_optimized_3", "pooling_layer.cl" },
     { "pooling_layer_7", "pooling_layer.cl" },
-    { "pooling_layer_MxN", "pooling_layer.cl" },
-    { "pooling_layer_MxN_quantized", "pooling_layer_quantized.cl" },
+    { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
+    { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
+    { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
+    { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "quantization_layer", "quantization_layer.cl" },
     { "reduction_operation", "reduction_operation.cl" },
     { "remap_nearest_neighbour", "remap.cl" },
@@ -351,6 +364,16 @@
     { "warp_affine_bilinear", "warp_affine.cl" },
     { "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
     { "warp_perspective_bilinear", "warp_perspective.cl" },
+    { "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" },
+    { "winograd_filter_transform_4x4_3x3_nchw", "winograd.cl" },
+    { "winograd_filter_transform_4x4_5x5_nchw", "winograd.cl" },
+    { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl" },
+    { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" },
+    { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" },
+    { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl" },
+    { "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" },
+    { "winograd_output_transform_4x4_3x3_nchw", "winograd.cl" },
+    { "winograd_output_transform_4x4_5x5_nchw", "winograd.cl" },
     { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
     { "YUYV422_to_NV12_bt709", "color_convert.cl" },
     { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
@@ -397,6 +420,10 @@
 #include "./cl_kernels/channel_extract.clembed"
     },
     {
+        "channel_shuffle.cl",
+#include "./cl_kernels/channel_shuffle.clembed"
+    },
+    {
         "col2im.cl",
 #include "./cl_kernels/col2im.clembed"
     },
@@ -409,6 +436,10 @@
 #include "./cl_kernels/color_convert.clembed"
     },
     {
+        "convert_fc_weights.cl",
+#include "./cl_kernels/convert_fc_weights.clembed"
+    },
+    {
         "convolution3x3.cl",
 #include "./cl_kernels/convolution3x3.clembed"
     },
@@ -433,6 +464,10 @@
 #include "./cl_kernels/convolution_rectangle.clembed"
     },
     {
+        "copy_tensor.cl",
+#include "./cl_kernels/copy_tensor.clembed"
+    },
+    {
         "deconvolution_layer.cl",
 #include "./cl_kernels/deconvolution_layer.clembed"
     },
@@ -676,12 +711,17 @@
         "warp_perspective.cl",
 #include "./cl_kernels/warp_perspective.clembed"
     },
+    {
+        "winograd.cl",
+#include "./cl_kernels/winograd.clembed"
+    },
 #endif /* EMBEDDED_KERNELS */
 };
 
 CLKernelLibrary::CLKernelLibrary()
     : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
 {
+    opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built
 }
 
 CLKernelLibrary &CLKernelLibrary::get()
@@ -699,22 +739,21 @@
     {
         ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
     }
-
     std::string concat_str;
 
-    if(fp16_support(_device))
+    if(fp16_supported(_device))
     {
         concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
     }
 
-    if(non_uniform_workgroup_support(_device))
-    {
-        concat_str += " -cl-arm-non-uniform-work-group-size ";
-    }
-    else if(get_cl_version(_device) == CLVersion::CL20)
+    if(get_cl_version(_device) == CLVersion::CL20)
     {
         concat_str += " -cl-std=CL2.0 ";
     }
+    else if(arm_non_uniform_workgroup_supported(_device))
+    {
+        concat_str += " -cl-arm-non-uniform-work-group-size ";
+    }
     else
     {
         ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
@@ -750,6 +789,11 @@
     return Kernel(kernel_name, cl_program);
 }
 
+void CLKernelLibrary::add_built_program(const std::string &built_program_name, cl::Program program)
+{
+    _built_programs_map.emplace(built_program_name, program);
+}
+
 const Program &CLKernelLibrary::load_program(const std::string &program_name) const
 {
     const auto program_it = _programs_map.find(program_name);
@@ -838,5 +882,26 @@
 
 cl::NDRange CLKernelLibrary::default_ndrange() const
 {
-    return cl::NDRange(128u, 1);
+    cl::Device  device  = cl::Device::getDefault();
+    GPUTarget    target = get_target_from_device(device);
+    cl::NDRange default_range;
+
+    switch(target)
+    {
+        case GPUTarget::MIDGARD:
+        case GPUTarget::T600:
+        case GPUTarget::T700:
+        case GPUTarget::T800:
+            default_range = cl::NDRange(128u, 1);
+            break;
+        default:
+            default_range = cl::NullRange;
+    }
+
+    return default_range;
+}
+
+std::string CLKernelLibrary::get_device_version()
+{
+    return _device.getInfo<CL_DEVICE_VERSION>();
 }
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index f75a90a..a8ed973 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -111,6 +111,12 @@
     LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle);
     LOAD_FUNCTION_PTR(clGetKernelInfo, handle);
     LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle);
+    LOAD_FUNCTION_PTR(clSVMAlloc, handle);
+    LOAD_FUNCTION_PTR(clSVMFree, handle);
+    LOAD_FUNCTION_PTR(clEnqueueSVMMap, handle);
+    LOAD_FUNCTION_PTR(clEnqueueSVMUnmap, handle);
+    LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
+    LOAD_FUNCTION_PTR(clWaitForEvents, handle);
 
 #undef LOAD_FUNCTION_PTR
 
@@ -129,6 +135,90 @@
 }
 } // namespace arm_compute
 
+cl_int clEnqueueMarker(cl_command_queue command_queue,
+                       cl_event        *event)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr;
+    if(func != nullptr)
+    {
+        return func(command_queue, event);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int clWaitForEvents(cl_uint         num_events,
+                       const cl_event *event_list)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr;
+    if(func != nullptr)
+    {
+        return func(num_events, event_list);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr,
+                       size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr;
+    if(func != nullptr)
+    {
+        return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list,
+                         const cl_event *event_wait_list, cl_event *event)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr;
+    if(func != nullptr)
+    {
+        return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl_uint alignment)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr;
+    if(func != nullptr)
+    {
+        return func(context, flags, size, alignment);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+void clSVMFree(cl_context context, void *svm_pointer)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clSVMFree_ptr;
+    if(func != nullptr)
+    {
+        func(context, svm_pointer);
+    }
+}
+
 cl_int clGetContextInfo(cl_context      context,
                         cl_context_info param_name,
                         size_t          param_value_size,
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 4424a66..a8ea738 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -115,6 +115,8 @@
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
 
+#if defined(ACT)
+
 /** This performs an activation function floating point inputs.
  *
  * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -168,3 +170,5 @@
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#endif /* defined(ACT) */
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index cb31e99..66e54ed 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -44,6 +44,26 @@
 #define ACTIVATION_OP2(op, x) op##_op(x)
 #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
 
+#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+#define PERFORM_ACTIVATION_QA8(act, data)                                                         \
+    ({                                                                                            \
+        data = ACTIVATION_OP(act, data);                                                          \
+        \
+        VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
+        fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
+        \
+        fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
+        data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));                               \
+    })
+#else /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+#define PERFORM_ACTIVATION_QA8(act, data) \
+    ({                                    \
+        data = ACTIVATION_OP(act, data);  \
+    })
+#endif /* defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL) */
+
+#if defined(ACT)
+
 /** This performs an activation function on QASYMM8 inputs.
  *
  * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
@@ -92,19 +112,11 @@
     // Load data
     TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
 
-    // Perform activation
-    data = ACTIVATION_OP(ACT, data);
-
-#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
-    // requantize to output space
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
-
-    fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
-    data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));
-#endif // defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
+    data = PERFORM_ACTIVATION_QA8(ACT, data);
 
     // Store result
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#endif /* defined(ACT) */
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5ddeb1a..9c980da 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -44,15 +44,12 @@
 
 #endif /* FIXED_POINT_POSITION */
 
-#if defined(LU_BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL)
-#elif defined(BRELU)
-#define ACTIVATION_FUNC(x) CLAMP(x, (DATA_TYPE)0, (DATA_TYPE)A_VAL)
-#elif defined(RELU)
-#define ACTIVATION_FUNC(x) max(x, (DATA_TYPE)0)
-#else /* FUSED_ACT */
+#if defined(FUSED_ACTIVATION)
+#include "activation_layer.cl"
+#define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
 #define ACTIVATION_FUNC(x) (x)
-#endif /* FUSED_ACT */
+#endif /* defined(FUSED_ACTIVATION) */
 
 /** Apply batch normalization.
  *
@@ -90,15 +87,19 @@
  * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
  * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
  */
-__kernel void batchnormalization_layer(TENSOR3D_DECLARATION(input),
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
 #ifndef IN_PLACE
-                                       TENSOR3D_DECLARATION(output),
+                                            TENSOR3D_DECLARATION(output),
 #endif /* not IN_PLACE */
-                                       VECTOR_DECLARATION(mean),
-                                       VECTOR_DECLARATION(var),
-                                       VECTOR_DECLARATION(beta),
-                                       VECTOR_DECLARATION(gamma),
-                                       float epsilon)
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
 {
     Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
 #ifdef IN_PLACE
@@ -106,10 +107,14 @@
 #else  /* IN_PLACE */
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
 #endif /* IN_PLACE */
-    Vector mean  = CONVERT_TO_VECTOR_STRUCT(mean);
-    Vector var   = CONVERT_TO_VECTOR_STRUCT(var);
-    Vector beta  = CONVERT_TO_VECTOR_STRUCT(beta);
+    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
     Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     data = 0;
@@ -120,9 +125,7 @@
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     x_bar = 0;
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    gamma_vec = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_vec = 0;
+    res = 0;
 
     const int current_slice = get_global_id(2);
 
@@ -135,11 +138,22 @@
     numerator = SUB_OP(data, numerator);
     x_bar     = MUL_OP(numerator, denominator);
 
-    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
-    beta_vec  = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
-
+#ifndef USE_DEFAULT_GAMMA
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res = ADD_OP(MUL_OP(gamma_vec, x_bar), beta_vec);
+    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+    res = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res          = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+    // beta is not zero, hence we need to perform the addition
+    res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
 
     res = ACTIVATION_FUNC(res);
 
@@ -147,4 +161,113 @@
     (res, 0, (__global DATA_TYPE *)out.ptr);
 }
 
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
\ No newline at end of file
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D out = in;
+#else  /* IN_PLACE */
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res = 0;
+
+    const int current_slice = get_global_id(0);
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(var.ptr + current_slice * VEC_SIZE * var.stride_x));
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * mean.stride_x));
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(gamma.ptr + current_slice * VEC_SIZE * gamma.stride_x));
+
+    res = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(beta.ptr + current_slice * VEC_SIZE * beta.stride_x));
+    // beta is not zero, hence we need to perform the addition
+    res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res = ACTIVATION_FUNC(res);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
index d309812..4207414 100644
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -338,9 +338,9 @@
     uchar8 data2 = vload8(0, src_plane2.ptr);
 
 #ifdef NV12
-    vstore16(shuffle2(data1, data2, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
+    vstore16(shuffle2(data1, data2, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
 #elif defined(NV21)
-    vstore16(shuffle2(data2, data1, (uchar16)(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15)), 0, dst_plane1.ptr);
+    vstore16(shuffle2(data2, data1, (uchar16)(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)), 0, dst_plane1.ptr);
 #endif /* NV12 or NV21 */
 }
 
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
new file mode 100644
index 0000000..26cee9c
--- /dev/null
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -0,0 +1,132 @@
+/*
+* Copyright (c) 2018 ARM Limited.
+*
+* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to
+* deal in the Software without restriction, including without limitation the
+* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+* sell copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in all
+* copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K)
+
+// Check valid BLOCK_SIZES
+#if BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16
+#error "Only block sizes 4, 8 and 16 are supported"
+#endif /* BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16 */
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+
+/** Performs a channel shuffle; see https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR3D_DECLARATION(src),
+                                   TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);         // src.ptr already points at this work-item's (x, y, channel) block
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst); // no step: dst is addressed below with absolute x/y/z offsets
+
+    const uint curr_channel = get_global_id(2); // channel id of input
+    const uint group_id     = curr_channel / K; // group id
+    const uint channel_id   = curr_channel % K; // channel id within the group
+
+    const uint x = get_global_id(0) * BLOCK_SIZE; // first column of this work-item's block
+    const uint y = get_global_id(1) * BLOCK_SIZE; // first row of this work-item's block
+    const uint z = channel_id * NUM_GROUPS + group_id; // shuffled output channel: transpose of (group_id, channel_id)
+
+    // Load the BLOCK_SIZE x BLOCK_SIZE block from the input channel
+    TYPE u0 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
+    TYPE u1 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
+    TYPE u2 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));
+    TYPE u3 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 3, 0));
+#if BLOCK_SIZE > 4
+    TYPE u4 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 4, 0));
+    TYPE u5 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 5, 0));
+    TYPE u6 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 6, 0));
+    TYPE u7 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 7, 0));
+#if BLOCK_SIZE == 16
+    TYPE u8  = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 8, 0));
+    TYPE u9  = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 9, 0));
+    TYPE u10 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 10, 0));
+    TYPE u11 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 11, 0));
+    TYPE u12 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 12, 0));
+    TYPE u13 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 13, 0));
+    TYPE u14 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 14, 0));
+    TYPE u15 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 15, 0));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+
+    // Store the rows at the same x/y position but on the shuffled output channel z
+    VSTORE(BLOCK_SIZE)
+    (u0, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 0, z));
+    VSTORE(BLOCK_SIZE)
+    (u1, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 1, z));
+    VSTORE(BLOCK_SIZE)
+    (u2, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 2, z));
+    VSTORE(BLOCK_SIZE)
+    (u3, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 3, z));
+#if BLOCK_SIZE > 4
+    VSTORE(BLOCK_SIZE)
+    (u4, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 4, z));
+    VSTORE(BLOCK_SIZE)
+    (u5, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 5, z));
+    VSTORE(BLOCK_SIZE)
+    (u6, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 6, z));
+    VSTORE(BLOCK_SIZE)
+    (u7, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 7, z));
+#if BLOCK_SIZE == 16
+    VSTORE(BLOCK_SIZE)
+    (u8, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 8, z));
+    VSTORE(BLOCK_SIZE)
+    (u9, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 9, z));
+    VSTORE(BLOCK_SIZE)
+    (u10, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 10, z));
+    VSTORE(BLOCK_SIZE)
+    (u11, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 11, z));
+    VSTORE(BLOCK_SIZE)
+    (u12, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 12, z));
+    VSTORE(BLOCK_SIZE)
+    (u13, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 13, z));
+    VSTORE(BLOCK_SIZE)
+    (u14, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 14, z));
+    VSTORE(BLOCK_SIZE)
+    (u15, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 15, z));
+#endif /* BLOCK_SIZE == 16 */
+#endif /* BLOCK_SIZE > 4 */
+}
+#endif /* defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K) */
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index a92ab5b..f97ae13 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,41 @@
  */
 #include "helpers.h"
 
+/** This kernel concatenates the input tensor into the output tensor along the first dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8, QASYMM8, QS16, F16, F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  offset                            The offset to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate_width(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    int offset)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); // points at this work-item's elements in the input
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); // same logical position in the output
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); // read VEC_SIZE elements per work-item
+
+    VSTORE(VEC_SIZE)
+    (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offset)); // write shifted by the sub-tensor's byte offset along X
+}
+
 /** This kernel concatenates the input tensor into the output tensor along the third dimension
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8, QS16, F16, F32
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl
new file mode 100644
index 0000000..3c3e8b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/convert_fc_weights.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
+/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights.
+ *
+ * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor.
+ * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QS8, QASYMM8, U16, S16, QS16, U32, S32, QS32, F16, F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convert_fc_weights(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src); // one source element per work-item
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y; // row y maps to (y % FACTOR_1) * FACTOR_2 + y / FACTOR_1
+
+    *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr); // copy the element to its permuted row, same column
+}
+#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
new file mode 100644
index 0000000..4b37dec
--- /dev/null
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Performs a copy of input tensor to the output tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All (set at compile time via -DDATA_TYPE).
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void copy_tensor(
+    VECTOR_DECLARATION(in),
+    VECTOR_DECLARATION(out))
+{
+    Vector in  = CONVERT_TO_VECTOR_STRUCT(in);
+    Vector out = CONVERT_TO_VECTOR_STRUCT(out);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data = vload16(0, (__global DATA_TYPE *)in.ptr); // each work-item copies 16 elements of DATA_TYPE
+
+    vstore16(data, 0, (__global DATA_TYPE *)out.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index f352138..5f4247e 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -24,6 +24,7 @@
 
 #include "helpers.h"
 
+#if defined(DEPTH_MULTIPLIER)
 #if defined(CONV_STRIDE_X)
 
 #if CONV_STRIDE_X == 1
@@ -192,6 +193,8 @@
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 #endif //defined(HAS_BIAS)
 
+    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
     uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
     float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
     float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
@@ -218,6 +221,22 @@
         acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
     })
 
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
+    ({                                                             \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
+        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
+        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
+        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
+        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
+        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
+        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
+        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
+        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
+        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
+    })
+
 #define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
     ({                                                                   \
         acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
@@ -228,6 +247,22 @@
         acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
     })
 
+#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
+    ({                                                                   \
+        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
+        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
+        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
+        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
+        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
+        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
+        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
+        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
+        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
+        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
+        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
+        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
+    })
+
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
  * stride_x and stride_y are equal to 1
  *
@@ -260,7 +295,7 @@
  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
-__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost(
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -280,20 +315,20 @@
     float2 pixels3 = 0.0f;
 
     __global uchar *weights_addr = (__global uchar *)weights.ptr;
-    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
 
     // Load the weights
     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
     float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
     float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));
 
-    // Note: Since each work-item computes 4x2 elements, we need to load 4 rows from the input tensor
+    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
     float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
     float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
     float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
     float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
-    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row3
-    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row3
+    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
+    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5
 
     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
     CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
@@ -357,7 +392,7 @@
  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
-__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost(
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -375,7 +410,7 @@
     float2 pixels1 = 0.0f;
 
     __global uchar *weights_addr = (__global uchar *)weights.ptr;
-    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
 
     // Load the weights
     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
@@ -414,6 +449,8 @@
     vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
 }
 
+#endif // defined(DEPTH_MULTIPLIER)
+
 #if defined(SRC_WIDTH) && defined(DATA_TYPE)
 /** This kernel reshapes each of the tensor's low three dimensions to single rows.
  *
@@ -463,17 +500,17 @@
 #if defined(HAS_BIAS)
     if(get_global_id(1) == 0)
     {
-        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));
+        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
     }
 #endif // defined(HAS_BIAS)
 }
 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
+ * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -502,7 +539,7 @@
 
     const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
     const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
-    const int src_z = get_global_id(2);
+    const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;
 
     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));
@@ -526,7 +563,7 @@
 #endif // defined(HAS_BIAS)
 }
 
-#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE)
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
 
 #if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 
@@ -565,7 +602,7 @@
 
 #endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
 #if defined(CONV_STRIDE_X)
 #if CONV_STRIDE_X == 1
 #define convolution1x3_f16 convolution1x3_stride_1_f16
@@ -684,6 +721,8 @@
     return pixels;
 }
 
+#if defined(DEPTH_MULTIPLIER)
+
 /** This OpenCL kernel computes the depthwise convolution 3x3
  *
  * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
@@ -694,7 +733,7 @@
  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
@@ -702,7 +741,7 @@
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F32
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -732,6 +771,8 @@
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 #endif //defined(HAS_BIAS)
 
+    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
     uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
     half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
     half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
@@ -746,5 +787,196 @@
 
     vstore4(pixels, 0, (__global half *)dst.ptr);
 }
+#endif // defined(DEPTH_MULTIPLIER)
 #endif // defined(CONV_STRIDE_X)
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
+ * when both stride_x and stride_y are equal to 1
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+#endif /* defined(HAS_BIAS) */
+
+    half4 pixels0 = 0.0f;
+    half4 pixels1 = 0.0f;
+    half4 pixels2 = 0.0f;
+    half4 pixels3 = 0.0f;
+
+    __global uchar *weights_addr = (__global uchar *)weights.ptr;
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
+    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+    half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);
+
+#ifdef HAS_BIAS
+    pixels0 += (half4)bias;
+    pixels1 += (half4)bias;
+    pixels2 += (half4)bias;
+    pixels3 += (half4)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
+    vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
+}
+
+/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
+ * when both stride_x and stride_y are equal to 2
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+#endif /* defined(HAS_BIAS) */
+
+    half4 pixels0 = 0.0f;
+    half4 pixels1 = 0.0f;
+
+    __global uchar *weights_addr = (__global uchar *)weights.ptr;
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
+    // Load the weights
+    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
+    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
+    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));
+
+    // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
+    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+    half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
+    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+    half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
+    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+    half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
+    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+    half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
+    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+    half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
+
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
+    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);
+
+#ifdef HAS_BIAS
+    pixels0 += (half4)bias;
+    pixels1 += (half4)bias;
+#endif /* defined(HAS_BIAS) */
+
+    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 40538a1..ccb3a1f 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -24,7 +24,20 @@
 
 #include "helpers_asymm.h"
 
-#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+
+#if defined(FUSED_ACTIVATION)
+#define DATA_TYPE uchar
+#ifndef VEC_SIZE
+#define VEC_SIZE 8
+#endif /* VEC_SIZE */
+#include "activation_layer_qa8.cl"
+#define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QA8(FUSED_ACTIVATION, x)
+#else /* defined(FUSED_ACTIVATION) */
+#define ACTIVATION_FUNC(x) (x)
+#endif /* defined(FUSED_ACTIVATION) */
+
+#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X)
 
 #if CONV_STRIDE_X > 3
 #error "Stride X not supported"
@@ -62,7 +75,7 @@
     })
 #endif /* CONV_STRIDE_X */
 
-/** This function computes the horizontal integral of the image and adds offsets.
+/** This function computes the depthwise convolution quantized.
  *
  * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
@@ -94,7 +107,7 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 
-__kernel void depthwise_convolution_3x3_quantized(
+__kernel void depthwise_convolution_3x3_quantized_nchw(
     TENSOR3D_DECLARATION(src),
     TENSOR3D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights)
@@ -113,6 +126,8 @@
     int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
 #endif //defined(HAS_BIAS)
 
+    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+
     uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
     uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
     uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
@@ -222,7 +237,7 @@
     res0        = max(res0, (uchar8)0);
     res0        = min(res0, (uchar8)255);
 
-    vstore8(res0, 0, dst.ptr);
+    vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
 #if CONV_STRIDE_Y == 1
 
     values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
@@ -231,8 +246,481 @@
     res1        = max(res1, (uchar8)0);
     res1        = min(res1, (uchar8)255);
 
-    vstore8(res1, 0, dst.ptr + dst_stride_y);
+    vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
 #endif /* CONV_STRIDE_Y == 1 */
 }
 
-#endif /* defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
+#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) */
+
+#if defined(VEC_SIZE) && defined(SRC_DEPTH) && defined(CONV_PAD_TOP) && defined(ROWS_READ)
+
+#define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
+
+#define BIFROST_MAD_4(acc, x, y)               \
+    ({                                         \
+        acc.s0 += (ushort)x.s0 * (ushort)y.s0; \
+        acc.s1 += (ushort)x.s1 * (ushort)y.s1; \
+        acc.s2 += (ushort)x.s2 * (ushort)y.s2; \
+        acc.s3 += (ushort)x.s3 * (ushort)y.s3; \
+    })
+
+#if WEIGHTS_OFFSET != 0
+#define BIFROST_MAD_ACC_4(acc, sum, x, y) \
+    ({                                    \
+        sum += CONVERT(x, VEC_INT);       \
+        BIFROST_MAD_4(acc, x, y);         \
+    })
+#else /* WEIGHTS_OFFSET != 0 */
+#define BIFROST_MAD_ACC_4(acc, sum, x, y) BIFROST_MAD_4(acc, x, y)
+#endif /* WEIGHTS_OFFSET != 0 */
+
+/** This function computes the depthwise convolution quantized.
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+
+__kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    , VECTOR_DECLARATION(biases)
+#endif /* defined(HAS_BIAS) */
+)
+{
+    Image  dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+#endif /* defined(HAS_BIAS) */
+
+    __global uchar *first_elem = src_ptr + src_offset_first_element_in_bytes;
+
+    const int z         = get_global_id(2);
+    const int pad_offs  = -ROWS_READ * src_stride_y;
+    const int src_offs0 = get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + z * src_step_z - CONV_PAD_TOP * src_stride_z;
+    const int src_offs1 = src_offs0 + src_stride_z;
+    const int src_offs2 = src_offs1 + src_stride_z;
+
+    const int cond_top    = z - CONV_PAD_TOP < 0;
+    const int cond_bottom = z * (src_step_z / src_stride_z) + 2 > SRC_DEPTH;
+
+    __global uchar *src_addr0 = first_elem + select(src_offs0, pad_offs, cond_top);
+    __global uchar *src_addr1 = first_elem + src_offs1;
+    __global uchar *src_addr2 = first_elem + select(src_offs2, pad_offs, cond_bottom);
+
+    VEC_INT sum_we = 0;
+    VEC_INT acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+    VEC_INT sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+
+    // z == 0
+    VEC_UCHAR w0, w1, w2;
+    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w0);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w1);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w0);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w1);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w2);
+
+    weights.ptr += weights_stride_z;
+
+    // z == 1
+    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w0);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w1);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w0);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w1);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w2);
+
+    weights.ptr += weights_stride_z;
+
+    // z == 2
+    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w0);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w1);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc1, sum1, values, w2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w0);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w1);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc3, sum3, values, w2);
+
+#if defined(HAS_BIAS)
+    acc0 += bias_values;
+    acc1 += bias_values;
+    acc2 += bias_values;
+    acc3 += bias_values;
+#endif /* defined(HAS_BIAS) */
+
+#if WEIGHTS_OFFSET != 0
+    acc0 += WEIGHTS_OFFSET * sum0;
+    acc1 += WEIGHTS_OFFSET * sum1;
+    acc2 += WEIGHTS_OFFSET * sum2;
+    acc3 += WEIGHTS_OFFSET * sum3;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    VEC_INT offs = INPUT_OFFSET * sum_we;
+
+    acc0 += offs;
+    acc1 += offs;
+    acc2 += offs;
+    acc3 += offs;
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    acc0 += (VEC_INT)K_OFFSET;
+    acc1 += (VEC_INT)K_OFFSET;
+    acc2 += (VEC_INT)K_OFFSET;
+    acc3 += (VEC_INT)K_OFFSET;
+#endif /* K_OFFSET != 0 */
+
+    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+
+    acc0 += (VEC_INT)OUTPUT_OFFSET;
+    acc1 += (VEC_INT)OUTPUT_OFFSET;
+    acc2 += (VEC_INT)OUTPUT_OFFSET;
+    acc3 += (VEC_INT)OUTPUT_OFFSET;
+
+    VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
+    VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
+    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
+    VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
+
+    res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
+    res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
+    res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
+    res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
+
+    VSTORE(VEC_SIZE)
+    (res0, 0, dst.ptr + 0 * dst_stride_y);
+    VSTORE(VEC_SIZE)
+    (res1, 0, dst.ptr + 1 * dst_stride_y);
+    VSTORE(VEC_SIZE)
+    (res2, 0, dst.ptr + 2 * dst_stride_y);
+    VSTORE(VEC_SIZE)
+    (res3, 0, dst.ptr + 3 * dst_stride_y);
+}
+
+/** This function computes the depthwise convolution quantized.
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: S32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
+
+__kernel void depthwise_convolution_3x3_quantized_nhwc_stride2(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases)
+#endif /* defined(HAS_BIAS) */
+)
+{
+    Image  dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    // Per-channel S32 bias values for the VEC_SIZE channels handled by this work-item (note the int cast).
+    VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);
+#endif /* defined(HAS_BIAS) */
+
+    __global uchar *first_elem = src_ptr + src_offset_first_element_in_bytes;
+
+    // z selects the pair of output rows computed by this work-item; the 3x3 window reads
+    // three consecutive source planes starting CONV_PAD_TOP planes before the current one.
+    const int z         = get_global_id(2);
+    // Fallback offset (ROWS_READ rows before the first element) substituted for the top/bottom
+    // plane when the window falls outside the source.
+    // NOTE(review): assumes that region is always addressable (e.g. covered by the tensor's
+    // padding/border) — confirm against the kernel configuration on the host side.
+    const int pad_offs  = -ROWS_READ * src_stride_y;
+    const int src_offs0 = get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + z * src_step_z - CONV_PAD_TOP * src_stride_z;
+    const int src_offs1 = src_offs0 + src_stride_z;
+    const int src_offs2 = src_offs1 + src_stride_z;
+
+    // Window overhang tests: top plane before the start of the source, or bottom plane past SRC_DEPTH.
+    const int cond_top    = z - CONV_PAD_TOP < 0;
+    const int cond_bottom = z * (src_step_z / src_stride_z) + 2 > SRC_DEPTH;
+
+    // Redirect out-of-range top/bottom plane reads to the fallback region; the middle plane is always valid.
+    __global uchar *src_addr0 = first_elem + select(src_offs0, pad_offs, cond_top);
+    __global uchar *src_addr1 = first_elem + src_offs1;
+    __global uchar *src_addr2 = first_elem + select(src_offs2, pad_offs, cond_bottom);
+
+    // sum_we accumulates the sum of all weights (used for the INPUT_OFFSET correction term).
+    // accN / sumN are the raw accumulator and the input-sum (used for the WEIGHTS_OFFSET
+    // correction term) of output row N. With stride 2, rows 0-2 feed acc0 and rows 2-4 feed
+    // acc2, so the middle input row of each plane is shared by both outputs.
+    VEC_INT sum_we = 0;
+    VEC_INT acc0 = 0, acc2 = 0;
+    VEC_INT sum0 = 0, sum2 = 0;
+
+    // z == 0
+    // First weight plane applied to five rows of the first source plane.
+    VEC_UCHAR w0, w1, w2;
+    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    VEC_UCHAR values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+
+    // Row 2 contributes to both output rows (last tap of acc0, first tap of acc2).
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+
+    src_addr0 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr0);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+
+    weights.ptr += weights_stride_z;
+
+    // z == 1
+    // Second weight plane applied to five rows of the middle source plane.
+    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+
+    // Shared middle row, as in the first plane.
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+
+    src_addr1 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr1);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+
+    weights.ptr += weights_stride_z;
+
+    // z == 2
+    // Third weight plane applied to five rows of the last source plane.
+    w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y);
+    w1 = VLOAD(VEC_SIZE)(0, weights.ptr + 1 * weights_stride_y);
+    w2 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y);
+
+#if INPUT_OFFSET != 0
+    sum_we += CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT);
+#endif /* INPUT_OFFSET != 0 */
+
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w0);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w1);
+
+    // Shared middle row, as in the first plane.
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc0, sum0, values, w2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w0);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w1);
+
+    src_addr2 += src_stride_y;
+    values = VLOAD(VEC_SIZE)(0, src_addr2);
+    BIFROST_MAD_ACC_4(acc2, sum2, values, w2);
+
+#if defined(HAS_BIAS)
+    // Add the per-channel bias to both output rows.
+    acc0 += bias_values;
+    acc2 += bias_values;
+#endif /* defined(HAS_BIAS) */
+
+#if WEIGHTS_OFFSET != 0
+    // Quantization correction: weights zero-point times the sum of the inputs of each row.
+    acc0 += WEIGHTS_OFFSET * sum0;
+    acc2 += WEIGHTS_OFFSET * sum2;
+#endif /* WEIGHTS_OFFSET != 0 */
+
+#if INPUT_OFFSET != 0
+    // Quantization correction: input zero-point times the sum of all weights (same for both rows).
+    VEC_INT offs = INPUT_OFFSET * sum_we;
+
+    acc0 += offs;
+    acc2 += offs;
+#endif /* INPUT_OFFSET != 0 */
+
+#if K_OFFSET != 0
+    // Constant correction term (presumably input_offset * weights_offset * number of taps — confirm
+    // against the host-side kernel configuration).
+    acc0 += (VEC_INT)K_OFFSET;
+    acc2 += (VEC_INT)K_OFFSET;
+#endif /* K_OFFSET != 0 */
+
+    // Requantize: fixed-point multiply/shift, add the output zero-point, then saturate to 8 bits.
+    acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc0 += (VEC_INT)OUTPUT_OFFSET;
+    acc2 += (VEC_INT)OUTPUT_OFFSET;
+    VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
+    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
+    // The CLAMP is a no-op after CONVERT_SAT to uchar (already saturated to [0, 255]); kept for safety.
+    res0           = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
+    res2           = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
+
+    // Store the two computed output rows.
+    VSTORE(VEC_SIZE)
+    (res0, 0, dst.ptr + 0 * dst_stride_y);
+    VSTORE(VEC_SIZE)
+    (res2, 0, dst.ptr + 1 * dst_stride_y);
+}
+
+#endif /* defined(VEC_SIZE) && defined(SRC_DEPTH) && defined(CONV_PAD_TOP) && defined(ROWS_READ) */
+
+#endif /* defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index fbd4f6a..33a9495 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,7 @@
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -110,7 +110,7 @@
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8, U16, S16, U32, S32, F32
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
  * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 58a550f..9ed3af8 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -49,27 +49,35 @@
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  */
-__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
-                                IMAGE_DECLARATION(dst))
+__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
+                                TENSOR3D_DECLARATION(dst))
 {
     uint x = get_global_id(0);
     uint y = get_global_id(1);
+    uint z = get_global_id(2);
 
     // Compute address for Matrix B - source
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
 
     // Compute address for Matrix B transposed - destination. X and Y are swapped
     uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
                              (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
 
+    // Add offset for batched GEMM
+    dst_addr_in_bytes += z * dst_stride_z;
+
     VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
     b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
 
@@ -90,37 +98,47 @@
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  */
-__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
-                                 IMAGE_DECLARATION(dst))
+__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
+                                 TENSOR3D_DECLARATION(dst))
 {
     // Compute source and destination addresses
     uint x = get_global_id(0);
     uint y = get_global_id(1);
+    uint z = get_global_id(2);
 
-    // Compute address for Matrix B - source
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    // Compute address for source tensor
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
 
     // Compute address for Matrix B transposed - destination. X and Y are swapped
     uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
                              (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
 
+    // Add offset for batched GEMM
+    dst_addr_in_bytes += z * dst_stride_z;
+
+    __global uchar *input_ptr = src.ptr;
+
     // Load values from Matrix A
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+    a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+    a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+    a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+    a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
 
     VEC_DATA_TYPE(DATA_TYPE, 4)
     val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
@@ -144,6 +162,8 @@
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -164,12 +184,16 @@
  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
  */
-__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
-                                                         IMAGE_DECLARATION(src1),
-                                                         IMAGE_DECLARATION(dst))
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+                                                 uint dst_stride_z)
 {
     int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
     int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -177,8 +201,18 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global float *src_end_addr_b = src_addr_b + COLS_B;
@@ -236,11 +270,17 @@
     c30 = c30 * (float4)ALPHA;
 #endif // defined(ALPHA)
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
     // Store 4x4 block
-    vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
-    vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
-    vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
-    vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
+    vstore4(c00, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore4(c10, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+    vstore4(c20, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+    vstore4(c30, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
 }
 
 /** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
@@ -249,6 +289,9 @@
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -271,10 +314,14 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
-                                                         IMAGE_DECLARATION(dst))
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+                                                         uint dst_stride_z)
 {
     int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
     int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -282,11 +329,18 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
 
-    // Compute end row address for matrix B
-    __global float *src_end_addr_b = src_addr_b + COLS_B;
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
 
     src_addr_a += offset_row_a;
     src_addr_b += offset_row_b;
@@ -309,35 +363,17 @@
     float c32 = 0.0f;
     float c33 = 0.0f;
 
-    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
+#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
+
+    int i = 0;
+    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
         float4 a0 = vload4(0, src_addr_a);
         float4 b0 = vload4(0, src_addr_b);
 
-        c00 = fma(a0.s0, b0.s0, c00);
-        c01 = fma(a0.s0, b0.s1, c01);
-        c02 = fma(a0.s0, b0.s2, c02);
-        c03 = fma(a0.s0, b0.s3, c03);
-
-        c10 = fma(a0.s1, b0.s0, c10);
-        c11 = fma(a0.s1, b0.s1, c11);
-        c12 = fma(a0.s1, b0.s2, c12);
-        c13 = fma(a0.s1, b0.s3, c13);
-
-        c20 = fma(a0.s2, b0.s0, c20);
-        c21 = fma(a0.s2, b0.s1, c21);
-        c22 = fma(a0.s2, b0.s2, c22);
-        c23 = fma(a0.s2, b0.s3, c23);
-
-        c30 = fma(a0.s3, b0.s0, c30);
-        c31 = fma(a0.s3, b0.s1, c31);
-        c32 = fma(a0.s3, b0.s2, c32);
-        c33 = fma(a0.s3, b0.s3, c33);
-
-        // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
-        b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
 
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
@@ -360,8 +396,11 @@
         c33 = fma(a0.s3, b0.s3, c33);
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
-        b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
+        a0 = vload4(0, src_addr_a);
+        b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
 
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
@@ -384,8 +423,38 @@
         c33 = fma(a0.s3, b0.s3, c33);
 
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
-        b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);
+        a0 = vload4(0, src_addr_a);
+        b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma(a0.s0, b0.s0, c00);
+        c01 = fma(a0.s0, b0.s1, c01);
+        c02 = fma(a0.s0, b0.s2, c02);
+        c03 = fma(a0.s0, b0.s3, c03);
+
+        c10 = fma(a0.s1, b0.s0, c10);
+        c11 = fma(a0.s1, b0.s1, c11);
+        c12 = fma(a0.s1, b0.s2, c12);
+        c13 = fma(a0.s1, b0.s3, c13);
+
+        c20 = fma(a0.s2, b0.s0, c20);
+        c21 = fma(a0.s2, b0.s1, c21);
+        c22 = fma(a0.s2, b0.s2, c22);
+        c23 = fma(a0.s2, b0.s3, c23);
+
+        c30 = fma(a0.s3, b0.s0, c30);
+        c31 = fma(a0.s3, b0.s1, c31);
+        c32 = fma(a0.s3, b0.s2, c32);
+        c33 = fma(a0.s3, b0.s3, c33);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
 
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
@@ -408,12 +477,15 @@
         c33 = fma(a0.s3, b0.s3, c33);
     }
 
-    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
+    for(; i < (int)(COLS_MTX_B); ++i)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
         float4 a0 = vload4(0, src_addr_a);
         float4 b0 = vload4(0, src_addr_b);
 
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
         c00 = fma(a0.s0, b0.s0, c00);
         c01 = fma(a0.s0, b0.s1, c01);
         c02 = fma(a0.s0, b0.s2, c02);
@@ -458,13 +530,22 @@
     c33 = c33 * ALPHA;
 #endif // defined(ALPHA)
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
     // Store 4x4 block
-    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
-    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
-    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));
-    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
+    vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+    vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+    vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
 }
 
+// Undefine local defines
+#undef COLS_MTX_B
+
 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
@@ -472,6 +553,8 @@
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -494,10 +577,14 @@
  */
 __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
-                                                 IMAGE_DECLARATION(dst))
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+                                                 uint dst_stride_z)
 {
     int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
     int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -505,8 +592,18 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global half *src_end_addr_b = src_addr_b + COLS_B;
@@ -564,12 +661,231 @@
     c30 = c30 * (half8)ALPHA;
 #endif // defined(ALPHA)
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
     // Store 4x8 block
-    vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
-    vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
-    vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
-    vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
+    vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+    vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+    vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+    vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
 }
+
+/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
+                                                         IMAGE_DECLARATION(src1),
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+                                                         uint dst_stride_z)
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    half8 c00 = 0.0f;
+    half8 c10 = 0.0f;
+    half8 c20 = 0.0f;
+    half8 c30 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
+
+    int i = 0;
+    for(; i <= (int)(COLS_MTX_B - 4); i += 4)
+    {
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half8 a0 = vload8(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+
+        // Load values from matrix B (transposed)
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s4, b0, c00);
+        c10 = fma((half8)a0.s5, b0, c10);
+        c20 = fma((half8)a0.s6, b0, c20);
+        c30 = fma((half8)a0.s7, b0, c30);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload8(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+
+        // Load values from matrix B (transposed)
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s4, b0, c00);
+        c10 = fma((half8)a0.s5, b0, c10);
+        c20 = fma((half8)a0.s6, b0, c20);
+        c30 = fma((half8)a0.s7, b0, c30);
+#else  // MULT_INTERLEAVE4X4_HEIGHT == 1
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a);
+        b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+    }
+
+    for(; i < (int)(COLS_MTX_B); ++i)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+        src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+        c00 = fma((half8)a0.s0, b0, c00);
+        c10 = fma((half8)a0.s1, b0, c10);
+        c20 = fma((half8)a0.s2, b0, c20);
+        c30 = fma((half8)a0.s3, b0, c30);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(ALPHA)
+    // Multiply by the weight of matrix product
+    c00 = c00 * (half8)ALPHA;
+    c10 = c10 * (half8)ALPHA;
+    c20 = c20 * (half8)ALPHA;
+    c30 = c30 * (half8)ALPHA;
+#endif // defined(ALPHA)
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+    // Store 4x8 block
+    vstore8(c00, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+    vstore8(c10, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+    vstore8(c20, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+    vstore8(c30, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 
 #if defined(FIXED_POINT_POSITION)
@@ -579,8 +895,9 @@
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 8 bit fixed point format
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note: ALPHA must be passed in 8 bit fixed point format
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -603,10 +920,14 @@
  */
 __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
-                                                 IMAGE_DECLARATION(dst))
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+                                                 uint dst_stride_z)
 {
     int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
     int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -614,8 +935,18 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes;
-    __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes;
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);
+    __global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global char *src_end_addr_b = src_addr_b + COLS_B;
@@ -667,11 +998,17 @@
     c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
     // Store 16x4 block
-    vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
-    vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
-    vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
-    vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
+    vstore16(c00_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
+    vstore16(c10_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
+    vstore16(c20_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
+    vstore16(c30_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
 }
 
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
@@ -680,8 +1017,9 @@
  * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
  * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 16 bit fixed point format
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note: ALPHA must be passed in 16 bit fixed point format
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -704,10 +1042,14 @@
  */
 __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
                                                   IMAGE_DECLARATION(src1),
-                                                  IMAGE_DECLARATION(dst))
+                                                  IMAGE_DECLARATION(dst),
+                                                  uint src0_stride_z,
+                                                  uint src1_stride_z,
+                                                  uint dst_stride_z)
 {
     int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
     int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -715,8 +1057,18 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);
+    __global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
     __global short *src_end_addr_b = src_addr_b + COLS_B;
@@ -759,11 +1111,17 @@
     c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
     // Store 8x4 block
-    vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
-    vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
-    vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
-    vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
+    vstore8(c00_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));
+    vstore8(c10_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));
+    vstore8(c20_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));
+    vstore8(c30_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));
 }
 #endif // defined(FIXED_POINT_POSITION)
 #endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
@@ -771,12 +1129,14 @@
 #if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
 #if defined(DATA_TYPE)
 #define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
  * @note This OpenCL kernel works with floating point data types (F16/F32)
  * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
  * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -799,7 +1159,10 @@
  */
 __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                      IMAGE_DECLARATION(src1),
-                                     IMAGE_DECLARATION(dst))
+                                     IMAGE_DECLARATION(dst),
+                                     uint src0_stride_z,
+                                     uint src1_stride_z,
+                                     uint dst_stride_z)
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -812,6 +1175,16 @@
     // Update address for the matrix B
     src_addr.s1 += idx * sizeof(DATA_TYPE);
 
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
 
     VECTOR_TYPE acc0 = 0.0f;
@@ -895,43 +1268,51 @@
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
     // Multiply by the weight of matrix-matrix product and store the result
 #if defined(ALPHA)
     acc0 = acc0 * (VECTOR_TYPE)ALPHA;
 #endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
+    (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if defined(ALPHA)
     acc1 = acc1 * (VECTOR_TYPE)ALPHA;
 #endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
+    (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if defined(ALPHA)
     acc2 = acc2 * (VECTOR_TYPE)ALPHA;
 #endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
+    (acc2, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 #if defined(ALPHA)
     acc3 = acc3 * (VECTOR_TYPE)ALPHA;
 #endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
+    (acc3, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 #endif // defined(DATA_TYPE)
 
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
  * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
  * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
  * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -954,7 +1335,10 @@
  */
 __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
-                                                 IMAGE_DECLARATION(dst))
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+                                                 uint dst_stride_z)
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -967,8 +1351,15 @@
     // Update address for matrix B
     src_addr.s1 += idx * sizeof(float);
 
-    // Address boundary for matrix A
-    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     // Initialize accumulators
     float acc00 = 0.0f;
@@ -998,72 +1389,162 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
     // A and B src indices get incremented at the same time.
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+    int i = 0;
+    for(; i <= ((int)COLS_A - 4); i += 4)
     {
-        // Load values from matrix A
-        float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+        // Load values from matrix A and matrix B
+        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        float2 a1 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+        float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        float2 a2 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+        float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        float2 a3 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+        float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        // Load values from matrix B
-        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
-        float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
 
         // Multiply and accumulate
         acc00 = fma(a0.s0, b0.s0, acc00);
-        acc00 = fma(a0.s1, b1.s0, acc00);
         acc01 = fma(a0.s0, b0.s1, acc01);
-        acc01 = fma(a0.s1, b1.s1, acc01);
         acc02 = fma(a0.s0, b0.s2, acc02);
-        acc02 = fma(a0.s1, b1.s2, acc02);
-        acc03 = fma(a0.s1, b1.s3, acc03);
         acc03 = fma(a0.s0, b0.s3, acc03);
 
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
         acc10 = fma(a1.s0, b0.s0, acc10);
         acc11 = fma(a1.s0, b0.s1, acc11);
         acc12 = fma(a1.s0, b0.s2, acc12);
         acc13 = fma(a1.s0, b0.s3, acc13);
 
-        acc10 = fma(a1.s1, b1.s0, acc10);
-        acc11 = fma(a1.s1, b1.s1, acc11);
-        acc12 = fma(a1.s1, b1.s2, acc12);
-        acc13 = fma(a1.s1, b1.s3, acc13);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
         acc20 = fma(a2.s0, b0.s0, acc20);
         acc21 = fma(a2.s0, b0.s1, acc21);
         acc22 = fma(a2.s0, b0.s2, acc22);
         acc23 = fma(a2.s0, b0.s3, acc23);
 
-        acc20 = fma(a2.s1, b1.s0, acc20);
-        acc21 = fma(a2.s1, b1.s1, acc21);
-        acc22 = fma(a2.s1, b1.s2, acc22);
-        acc23 = fma(a2.s1, b1.s3, acc23);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
         acc30 = fma(a3.s0, b0.s0, acc30);
         acc31 = fma(a3.s0, b0.s1, acc31);
         acc32 = fma(a3.s0, b0.s2, acc32);
         acc33 = fma(a3.s0, b0.s3, acc33);
-
-        acc30 = fma(a3.s1, b1.s0, acc30);
-        acc31 = fma(a3.s1, b1.s1, acc31);
-        acc32 = fma(a3.s1, b1.s2, acc32);
-        acc33 = fma(a3.s1, b1.s3, acc33);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix A and matrix B
+        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc00 = fma(a0.s1, b0.s0, acc00);
+        acc01 = fma(a0.s1, b0.s1, acc01);
+        acc02 = fma(a0.s1, b0.s2, acc02);
+        acc03 = fma(a0.s1, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc10 = fma(a1.s1, b0.s0, acc10);
+        acc11 = fma(a1.s1, b0.s1, acc11);
+        acc12 = fma(a1.s1, b0.s2, acc12);
+        acc13 = fma(a1.s1, b0.s3, acc13);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc20 = fma(a2.s1, b0.s0, acc20);
+        acc21 = fma(a2.s1, b0.s1, acc21);
+        acc22 = fma(a2.s1, b0.s2, acc22);
+        acc23 = fma(a2.s1, b0.s3, acc23);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc30 = fma(a3.s1, b0.s0, acc30);
+        acc31 = fma(a3.s1, b0.s1, acc31);
+        acc32 = fma(a3.s1, b0.s2, acc32);
+        acc33 = fma(a3.s1, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix A and matrix B
+        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc00 = fma(a0.s2, b0.s0, acc00);
+        acc01 = fma(a0.s2, b0.s1, acc01);
+        acc02 = fma(a0.s2, b0.s2, acc02);
+        acc03 = fma(a0.s2, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc10 = fma(a1.s2, b0.s0, acc10);
+        acc11 = fma(a1.s2, b0.s1, acc11);
+        acc12 = fma(a1.s2, b0.s2, acc12);
+        acc13 = fma(a1.s2, b0.s3, acc13);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc20 = fma(a2.s2, b0.s0, acc20);
+        acc21 = fma(a2.s2, b0.s1, acc21);
+        acc22 = fma(a2.s2, b0.s2, acc22);
+        acc23 = fma(a2.s2, b0.s3, acc23);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc30 = fma(a3.s2, b0.s0, acc30);
+        acc31 = fma(a3.s2, b0.s1, acc31);
+        acc32 = fma(a3.s2, b0.s2, acc32);
+        acc33 = fma(a3.s2, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix A and matrix B
+        b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Multiply and accumulate
+        acc00 = fma(a0.s3, b0.s0, acc00);
+        acc01 = fma(a0.s3, b0.s1, acc01);
+        acc02 = fma(a0.s3, b0.s2, acc02);
+        acc03 = fma(a0.s3, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+        acc10 = fma(a1.s3, b0.s0, acc10);
+        acc11 = fma(a1.s3, b0.s1, acc11);
+        acc12 = fma(a1.s3, b0.s2, acc12);
+        acc13 = fma(a1.s3, b0.s3, acc13);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+        acc20 = fma(a2.s3, b0.s0, acc20);
+        acc21 = fma(a2.s3, b0.s1, acc21);
+        acc22 = fma(a2.s3, b0.s2, acc22);
+        acc23 = fma(a2.s3, b0.s3, acc23);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        acc30 = fma(a3.s3, b0.s0, acc30);
+        acc31 = fma(a3.s3, b0.s1, acc31);
+        acc32 = fma(a3.s3, b0.s2, acc32);
+        acc33 = fma(a3.s3, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 4 * sizeof(float);
     }
 
-    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+    for(; i < (int)COLS_A; ++i)
     {
         // Load values from matrix A
-        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
         float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -1075,6 +1556,7 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         // Load values from matrix B
         float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
 
         // Multiply and accumulate
         acc00 = fma(a0, b0.s0, acc00);
@@ -1099,6 +1581,8 @@
         acc32 = fma(a3, b0.s2, acc32);
         acc33 = fma(a3, b0.s3, acc33);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += sizeof(float);
     }
 
     // Compute destination address
@@ -1112,8 +1596,14 @@
     acc03 = acc03 * ALPHA;
 #endif // defined(ALPHA)
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
     float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));
-    vstore4(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
+    vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
 
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if defined(ALPHA)
@@ -1123,7 +1613,7 @@
     acc13 = acc13 * ALPHA;
 #endif // defined(ALPHA)
     float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));
-    vstore4(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
+    vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if defined(ALPHA)
@@ -1133,7 +1623,7 @@
     acc23 = acc23 * ALPHA;
 #endif // defined(ALPHA)
     float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));
-    vstore4(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
+    vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 #if defined(ALPHA)
@@ -1143,7 +1633,7 @@
     acc33 = acc33 * ALPHA;
 #endif // defined(ALPHA)
     float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));
-    vstore4(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
+    vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 
@@ -1155,6 +1645,8 @@
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
  * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
  * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1177,7 +1669,10 @@
  */
 __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
                                                       IMAGE_DECLARATION(src1),
-                                                      IMAGE_DECLARATION(dst))
+                                                      IMAGE_DECLARATION(dst),
+                                                      uint src0_stride_z,
+                                                      uint src1_stride_z,
+                                                      uint dst_stride_z)
 {
     // Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
@@ -1191,8 +1686,15 @@
     // Update address for the matrix B
     src_addr.s1 += idx * sizeof(float);
 
-    // Address boundary for the matrix A
-    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     // Initialize accumulators
     float acc00 = 0.0f;
@@ -1212,67 +1714,114 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
     // A and B src indices get incremented at the same time.
-    for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(float)); src_addr += (int2)(4 * sizeof(float), 4 * src1_stride_y))
+    int i = 0;
+    for(; i <= ((int)COLS_A - 8); i += 8)
     {
         // Load values from matrix A
-        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+        float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
 
         // Load values from matrix B
-        float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
-        float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
-        float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
-        float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+        float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
 
         // Multiply and accumulate
         acc00 = fma(a0.s0, b0.s0, acc00);
         acc00 = fma(a0.s1, b1.s0, acc00);
         acc00 = fma(a0.s2, b2.s0, acc00);
         acc00 = fma(a0.s3, b3.s0, acc00);
+        acc00 = fma(a0.s4, b4.s0, acc00);
+        acc00 = fma(a0.s5, b5.s0, acc00);
+        acc00 = fma(a0.s6, b6.s0, acc00);
+        acc00 = fma(a0.s7, b7.s0, acc00);
 
         acc01 = fma(a0.s0, b0.s1, acc01);
         acc01 = fma(a0.s1, b1.s1, acc01);
         acc01 = fma(a0.s2, b2.s1, acc01);
         acc01 = fma(a0.s3, b3.s1, acc01);
+        acc01 = fma(a0.s4, b4.s1, acc01);
+        acc01 = fma(a0.s5, b5.s1, acc01);
+        acc01 = fma(a0.s6, b6.s1, acc01);
+        acc01 = fma(a0.s7, b7.s1, acc01);
 
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        a0    = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+        a0    = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
         acc10 = fma(a0.s0, b0.s0, acc10);
         acc10 = fma(a0.s1, b1.s0, acc10);
         acc10 = fma(a0.s2, b2.s0, acc10);
         acc10 = fma(a0.s3, b3.s0, acc10);
+        acc10 = fma(a0.s4, b4.s0, acc10);
+        acc10 = fma(a0.s5, b5.s0, acc10);
+        acc10 = fma(a0.s6, b6.s0, acc10);
+        acc10 = fma(a0.s7, b7.s0, acc10);
 
         acc11 = fma(a0.s0, b0.s1, acc11);
         acc11 = fma(a0.s1, b1.s1, acc11);
         acc11 = fma(a0.s2, b2.s1, acc11);
         acc11 = fma(a0.s3, b3.s1, acc11);
+        acc11 = fma(a0.s4, b4.s1, acc11);
+        acc11 = fma(a0.s5, b5.s1, acc11);
+        acc11 = fma(a0.s6, b6.s1, acc11);
+        acc11 = fma(a0.s7, b7.s1, acc11);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        a0    = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+        a0    = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
         acc20 = fma(a0.s0, b0.s0, acc20);
         acc20 = fma(a0.s1, b1.s0, acc20);
         acc20 = fma(a0.s2, b2.s0, acc20);
         acc20 = fma(a0.s3, b3.s0, acc20);
+        acc20 = fma(a0.s4, b4.s0, acc20);
+        acc20 = fma(a0.s5, b5.s0, acc20);
+        acc20 = fma(a0.s6, b6.s0, acc20);
+        acc20 = fma(a0.s7, b7.s0, acc20);
 
         acc21 = fma(a0.s0, b0.s1, acc21);
         acc21 = fma(a0.s1, b1.s1, acc21);
         acc21 = fma(a0.s2, b2.s1, acc21);
         acc21 = fma(a0.s3, b3.s1, acc21);
+        acc21 = fma(a0.s4, b4.s1, acc21);
+        acc21 = fma(a0.s5, b5.s1, acc21);
+        acc21 = fma(a0.s6, b6.s1, acc21);
+        acc21 = fma(a0.s7, b7.s1, acc21);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        a0    = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+        a0    = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
         acc30 = fma(a0.s0, b0.s0, acc30);
         acc30 = fma(a0.s1, b1.s0, acc30);
         acc30 = fma(a0.s2, b2.s0, acc30);
         acc30 = fma(a0.s3, b3.s0, acc30);
+        acc30 = fma(a0.s4, b4.s0, acc30);
+        acc30 = fma(a0.s5, b5.s0, acc30);
+        acc30 = fma(a0.s6, b6.s0, acc30);
+        acc30 = fma(a0.s7, b7.s0, acc30);
 
         acc31 = fma(a0.s0, b0.s1, acc31);
         acc31 = fma(a0.s1, b1.s1, acc31);
         acc31 = fma(a0.s2, b2.s1, acc31);
         acc31 = fma(a0.s3, b3.s1, acc31);
+        acc31 = fma(a0.s4, b4.s1, acc31);
+        acc31 = fma(a0.s5, b5.s1, acc31);
+        acc31 = fma(a0.s6, b6.s1, acc31);
+        acc31 = fma(a0.s7, b7.s1, acc31);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += sizeof(float) * 8;
     }
     // float size increment
-    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(4, src1_stride_y))
+    for(; i < (int)COLS_A; ++i)
     {
         // Load values from matrix A
         float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
@@ -1287,6 +1836,7 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
         // Load values from matrix B
         float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
 
         // Multiply and accumulate
         acc00 = fma(a0, b0.s0, acc00);
@@ -1303,25 +1853,33 @@
         acc30 = fma(a3, b0.s0, acc30);
         acc31 = fma(a3, b0.s1, acc31);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += sizeof(float);
     }
 
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
     // Multiply by the weight of matrix-matrix product and store the result
 #if defined(ALPHA)
     acc00 = acc00 * ALPHA;
     acc01 = acc01 * ALPHA;
 #endif // defined(ALPHA)
     float2 acc0 = ((float2)(acc00, acc01));
-    vstore2(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
+    vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if defined(ALPHA)
     acc10 = acc10 * ALPHA;
     acc11 = acc11 * ALPHA;
 #endif // defined(ALPHA)
     float2 acc1 = ((float2)(acc10, acc11));
-    vstore2(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
+    vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if defined(ALPHA)
@@ -1329,7 +1887,7 @@
     acc21 = acc21 * ALPHA;
 #endif // defined(ALPHA)
     float2 acc2 = ((float2)(acc20, acc21));
-    vstore2(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
+    vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 #if defined(ALPHA)
@@ -1337,7 +1895,212 @@
     acc31 = acc31 * ALPHA;
 #endif // defined(ALPHA)
     float2 acc3 = (float2)(acc30, acc31);
-    vstore2(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
+    vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+                                                 uint dst_stride_z)
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    half8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    half8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    half8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    half8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    int i = 0;
+    for(; i <= ((int)COLS_A - 4); i += 4)
+    {
+        // Load values from matrix A
+        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+
+        // Accumulate
+        acc0 = fma(b0, (half8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (half8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (half8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (half8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 4 * sizeof(half);
+    }
+
+    for(; i < (int)COLS_A; ++i)
+    {
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+        src_addr += (int2)(sizeof(half), src1_stride_y);
+
+        // Accumulate
+        acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
+#endif                                   // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
+#endif                                   // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
+#endif                                   // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    acc0 = acc0 * (half8)ALPHA;
+#endif // defined(ALPHA)
+    vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
+    acc1 = acc1 * (half8)ALPHA;
+#endif // defined(ALPHA)
+    vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
+    acc2 = acc2 * (half8)ALPHA;
+#endif // defined(ALPHA)
+    vstore8(acc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
+    acc3 = acc3 * (half8)ALPHA;
+#endif // defined(ALPHA)
+    vstore8(acc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 
@@ -1349,6 +2112,8 @@
  * @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
  * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
  * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8/QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1371,7 +2136,10 @@
  */
 __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
                           IMAGE_DECLARATION(src1),
-                          IMAGE_DECLARATION(dst))
+                          IMAGE_DECLARATION(dst),
+                          uint src0_stride_z,
+                          uint src1_stride_z,
+                          uint dst_stride_z)
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -1384,6 +2152,16 @@
     // Update address for the matrix B
     src_addr.s1 += idx * sizeof(char);
 
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
 
     short8 acc00 = 0;
@@ -1475,33 +2253,39 @@
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
     // Multiply by the weight of matrix product and store the result
     char16 acc_qs8;
     acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
 #if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
+    vstore16(acc_qs8, 0, (__global char *)(dst_addr + 0 * dst_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
 #if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
+    vstore16(acc_qs8, 0, (__global char *)(dst_addr + 1 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
     acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
 #if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
+    vstore16(acc_qs8, 0, (__global char *)(dst_addr + 2 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
 #if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
+    vstore16(acc_qs8, 0, (__global char *)(dst_addr + 3 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 
@@ -1512,6 +2296,8 @@
  * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
  * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
  * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8/QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1534,7 +2320,10 @@
  */
 __kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
                            IMAGE_DECLARATION(src1),
-                           IMAGE_DECLARATION(dst))
+                           IMAGE_DECLARATION(dst),
+                           uint src0_stride_z,
+                           uint src1_stride_z,
+                           uint dst_stride_z)
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -1547,6 +2336,16 @@
     // Update address for the matrix B
     src_addr.s1 += idx * sizeof(short);
 
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
 
     int8 acc0 = 0;
@@ -1622,33 +2421,39 @@
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
     // Multiply by the weight of matrix product and store the result
     short8 acc_qs16;
     acc_qs16 = convert_short8_sat(acc0);
 #if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
+    vstore8(acc_qs16, 0, (__global short *)(dst_addr + 0 * dst_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     acc_qs16 = convert_short8_sat(acc1);
 #if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
+    vstore8(acc_qs16, 0, (__global short *)(dst_addr + 1 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
     acc_qs16 = convert_short8_sat(acc2);
 #if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
+    vstore8(acc_qs16, 0, (__global short *)(dst_addr + 2 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     acc_qs16 = convert_short8_sat(acc3);
 #if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
 #endif // defined(ALPHA)
-    vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
+    vstore8(acc_qs16, 0, (__global short *)(dst_addr + 3 * dst_stride_y));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 #endif // defined(FIXED_POINT_POSITION)
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 02c6c4c..615c518 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -29,7 +29,9 @@
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 
 #if defined(ARM_COMPUTE_DEBUG_ENABLED)
+#if defined(cl_arm_printf)
 #pragma OPENCL EXTENSION cl_arm_printf : enable
+#endif // defined(cl_arm_printf)
 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
 
 #define EXPAND(x) x
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index a69bcc1..c314d17 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -62,6 +62,7 @@
         b_64 = convert_long##size(b);                                                                        \
         VEC_DATA_TYPE(long, size)                                                                            \
         ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
         VEC_DATA_TYPE(int, size)                                                                             \
         ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
         return select(ab_x2_high32, INT_MAX, overflow);                                                      \
@@ -366,4 +367,4 @@
 ASYMM_RESCALE_IMPL(8)
 ASYMM_RESCALE_IMPL(16)
 
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
index 3d37fbc..407ee2f 100644
--- a/src/core/CL/cl_kernels/hog.cl
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -351,7 +351,7 @@
 }
 #endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
 
-#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(BLOCK_STRIDE_WIDTH) && defined(BLOCK_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
+#if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
 
 /** This OpenCL kernel computes the HOG detector using linear SVM
  *
@@ -362,8 +362,8 @@
  * -# -DTHRESHOLD = Threshold for the distance between features and SVM classifying plane
  * -# -DMAX_NUM_DETECTION_WINDOWS = Maximum number of possible detection windows. It is equal to the size of the DetectioWindow array
  * -# -DIDX_CLASS = Index of the class to detect
- * -# -DBLOCK_STRIDE_WIDTH = Block stride for the X direction
- * -# -DBLOCK_STRIDE_HEIGHT = Block stride for the Y direction
+ * -# -DDETECTION_WINDOW_STRIDE_WIDTH = Detection window stride for the X direction
+ * -# -DDETECTION_WINDOW_STRIDE_HEIGHT = Detection window stride for the Y direction
  * -# -DDETECTION_WINDOW_WIDTH = Width of the detection window
  * -# -DDETECTION_WINDOW_HEIGHT = Height of the detection window
  *
@@ -443,8 +443,8 @@
         int id = atomic_inc(num_detection_windows);
         if(id < MAX_NUM_DETECTION_WINDOWS)
         {
-            dst[id].x         = get_global_id(0) * BLOCK_STRIDE_WIDTH;
-            dst[id].y         = get_global_id(1) * BLOCK_STRIDE_HEIGHT;
+            dst[id].x         = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
+            dst[id].y         = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
             dst[id].width     = DETECTION_WINDOW_WIDTH;
             dst[id].height    = DETECTION_WINDOW_HEIGHT;
             dst[id].idx_class = IDX_CLASS;
@@ -453,4 +453,4 @@
     }
 }
 #endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
-        * BLOCK_STRIDE_WIDTH && BLOCK_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
+        * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 75d99bd..1e85e1b 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -680,6 +680,7 @@
  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
@@ -722,10 +723,12 @@
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
 
     // Linearize convolution elements
-    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
     {
-        for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
+        int y = yi + yk * DILATION_Y;
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
         {
+            int x = xi + xk * DILATION_X;
 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
 #else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
index e1131d5..8a126a0 100644
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,7 @@
     float x;               /**< The x coordinate. */
     float y;               /**< The y coordinate. */
     float tracking_status; /**< A zero indicates a lost point. Initialized to 1 by corner detectors. */
-    float dummy;
+    float dummy;           /**< Dummy member for alignment. */
 } InternalKeypoint;
 
 /** Threshold for the determinant. Used for lost tracking criteria */
@@ -167,7 +167,11 @@
     Keypoint new_point;
     new_point.x               = round(new_point_internal.x);
     new_point.y               = round(new_point_internal.y);
+    new_point.strength        = 0.f;
+    new_point.scale           = 0.f;
+    new_point.orientation     = 0.f;
     new_point.tracking_status = new_point_internal.tracking_status;
+    new_point.error           = 0.f;
 
     // Store new point
     new_points[idx] = new_point;
@@ -352,8 +356,7 @@
  * @param[in]      border_limits                           It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
  * @param[in]      eig_const                               1.0f / (float)(2.0f * window_dimension * window_dimension)
  * @param[in]      level0                                  It is set to 1 if level of pyramid = 0
- * @param[in]      term_iteration                          It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
- * @param[in]      term_epsilon                            It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
+ * @param[in]      term_epsilon                            It is set to 1 if termination = TERM_CRITERIA_EPSILON
  */
 void __kernel lktracker_stage1(
     IMAGE_DECLARATION(new_image),
@@ -368,7 +371,6 @@
     const float3     border_limits,
     const float      eig_const,
     const int        level0,
-    const int        term_iteration,
     const int        term_epsilon)
 {
     int   idx       = get_global_id(0);
@@ -512,10 +514,7 @@
         // Update previous delta
         prev_delta = delta;
 
-        if(term_iteration == 1)
-        {
-            j++;
-        }
+        j++;
     }
 
     new_points[idx].xy = out_new_point;
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index dae0b99..2c7ddfd 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -62,6 +62,8 @@
 
 #endif /* FIXED_POINT_POSITION */
 
+#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(DATA_TYPE, 8))(1.f / y))
+
 #if STRIDE_X == 1
 #define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
 #elif STRIDE_X == 2 /* STRIDE_X == 1 */
@@ -423,7 +425,7 @@
 
 #endif // POOL_AVG
 
-/** Performs a pooling function of pool size equal to N
+/** Performs a pooling function of pool size equal to N  (NCHW)
  *
  * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note -DFP16 must be passed at compile time if half float data type is used
@@ -451,7 +453,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void pooling_layer_MxN(
+__kernel void pooling_layer_MxN_nchw(
     TENSOR3D_DECLARATION(input),
     TENSOR3D_DECLARATION(output))
 {
@@ -512,3 +514,97 @@
     *(__global DATA_TYPE *)output.ptr = res;
 }
 #endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+                                   const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = get_global_id(1) * stride_x - pad_x;
+    int start_y = get_global_id(2) * stride_y - pad_y;
+
+#if !defined(EXCLUDE_PADDING)
+    upper_bound_w += pad_x;
+    upper_bound_h += pad_y;
+#endif /* !defined(EXCLUDE_PADDING) */
+    const int end_x = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NHWC)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
+ * @note -DFP16 must be passed at compile time if half float data type is used
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_nhwc(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    vdata           = INITIAL_VALUE;
+    DATA_TYPE sdata = INITIAL_VALUE;
+
+    const int idx_width  = get_global_id(1) * STRIDE_X;
+    const int idx_height = get_global_id(2) * STRIDE_Y;
+
+    for(int y = 0; y < POOL_SIZE_Y; ++y)
+    {
+        int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
+        for(int x = 0; x < POOL_SIZE_X; ++x)
+        {
+            int x1 = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
+            x1     = select(x1, PAD_X - idx_width - 1, y != y1);
+
+            VEC_DATA_TYPE(DATA_TYPE, 8)
+            data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            vdata = POOL_OP(vdata, data0);
+        }
+    }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    vdata = DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    vdata = SQRT_OP(vdata);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    vstore8(vdata, 0, (__global DATA_TYPE *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 98850c0..17d893a 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -31,6 +31,8 @@
 
 #define DIV_OP(x, y) (x * (1.f / y))
 
+#define DIV_OP_NHWC(x, y) (convert_float8(x) * (float8)(1.f / y))
+
 #if defined(POOL_L2)
 #error "L2 pooling is not supported"
 #endif /* defined(POOL_L2) */
@@ -49,7 +51,7 @@
     return ((end_y - start_y) * (end_x - start_x));
 }
 
-/** Performs a pooling function of pool size equal to N
+/** Performs a pooling function of pool size equal to N (NCHW)
  *
  * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
  * @note In case of average pooling the following information must be passed at compile time:
@@ -75,7 +77,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void pooling_layer_MxN_quantized(
+__kernel void pooling_layer_MxN_quantized_nchw(
     TENSOR3D_DECLARATION(input),
     TENSOR3D_DECLARATION(output))
 {
@@ -119,3 +121,79 @@
     // Store result
     *(__global uchar *)output.ptr = convert_uchar(res);
 }
+
+int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
+                             const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = get_global_id(1) * stride_x - pad_x;
+    int start_y = get_global_id(2) * stride_y - pad_y;
+
+    const int end_x = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = min(start_y + pool_size_y, upper_bound_h);
+
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NHWC)
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nhwc(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int8 vdata = 0;
+
+    const int idx_width  = get_global_id(1) * STRIDE_X;
+    const int idx_height = get_global_id(2) * STRIDE_Y;
+
+    for(int y = 0; y < POOL_SIZE_Y; ++y)
+    {
+        int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
+        for(int x = 0; x < POOL_SIZE_X; ++x)
+        {
+            int x1      = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
+            x1          = select(x1, PAD_X - idx_width - 1, y != y1);
+            uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+            int8 data0  = convert_int8(data);
+            vdata       = POOL_OP(vdata, data0);
+        }
+    }
+
+#if defined(POOL_AVG)
+    // Divide by pool region in case of average pooling
+    vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))));
+#endif /* defined(POOL_AVG) */
+
+    // Store result
+    vstore8(convert_uchar8(vdata), 0, (__global uchar *)output.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl
new file mode 100644
index 0000000..0458e53
--- /dev/null
+++ b/src/core/CL/cl_kernels/winograd.cl
@@ -0,0 +1,1611 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(NUM_CHANNELS)
+
+/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+    float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
+    float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
+    float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+    // Transform the 3x3 tile in a 4x4 tile
+    float4 out0 = 0.0f;
+    float4 out1 = 0.0f;
+    float4 out2 = 0.0f;
+    float4 out3 = 0.0f;
+
+    // Row 0
+    out0.s0 = (w0.s0);
+    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+    out0.s3 = (w0.s2);
+
+    // Row 1
+    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+    // Row 2
+    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+    // Row 3
+    out3.s0 = (w2.s0);
+    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+    out3.s3 = (w2.s2);
+
+    int z  = get_global_id(2);
+    int x0 = z / NUM_CHANNELS; // idx filter
+    int y0 = z % NUM_CHANNELS; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the 16 values across the 16 channels
+    *(__global float *)(dst_addr + 0 * dst_stride_z)  = out0.s0;
+    *(__global float *)(dst_addr + 1 * dst_stride_z)  = out0.s1;
+    *(__global float *)(dst_addr + 2 * dst_stride_z)  = out0.s2;
+    *(__global float *)(dst_addr + 3 * dst_stride_z)  = out0.s3;
+    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
+    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
+    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
+    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
+    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
+    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
+    *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+    *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+    *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+    *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+    *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+    *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 4x4
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+    // Address of the top-left element of the 3x3 filter plane processed by this work-item
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor (one float3 per filter row)
+    float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
+    float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
+    float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+    // Transform the 3x3 tile into a 6x6 tile: each outN below holds one row of the
+    // transformed filter in its .s0-.s5 components (.s6/.s7 stay zero and are unused)
+    float8 out0 = 0.0f;
+    float8 out1 = 0.0f;
+    float8 out2 = 0.0f;
+    float8 out3 = 0.0f;
+    float8 out4 = 0.0f;
+    float8 out5 = 0.0f;
+
+    // Row 0
+    out0.s0 = (w0.s0) / 16.f;
+    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s5 = (w0.s2) / 4.f;
+
+    // Row 1
+    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+    // Row 2
+    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+    // Row 3
+    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 4
+    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 5
+    out5.s0 = (w2.s0) / 4.f;
+    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s5 = (w2.s2);
+
+    // Decompose the Z work-item index into (filter index, channel index)
+    int z  = get_global_id(2);
+    int x0 = z / NUM_CHANNELS; // idx filter
+    int y0 = z % NUM_CHANNELS; // idx channel
+
+    // Get output address: filter index selects X, channel index selects Y
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the 36 values across the 36 channels
+    *(__global float *)(dst_addr + 0 * dst_stride_z)  = out0.s0;
+    *(__global float *)(dst_addr + 1 * dst_stride_z)  = out0.s1;
+    *(__global float *)(dst_addr + 2 * dst_stride_z)  = out0.s2;
+    *(__global float *)(dst_addr + 3 * dst_stride_z)  = out0.s3;
+    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out0.s4;
+    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out0.s5;
+    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
+    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
+    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
+    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
+    *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+    *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+    *(__global float *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+    *(__global float *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+    *(__global float *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+    *(__global float *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+    *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+    *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+    *(__global float *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+    *(__global float *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+    *(__global float *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+    *(__global float *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+    *(__global float *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+    *(__global float *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+    *(__global float *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+    *(__global float *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+    *(__global float *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+    *(__global float *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+    *(__global float *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+    *(__global float *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+    *(__global float *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+    *(__global float *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+    *(__global float *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+    *(__global float *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+    *(__global float *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+    *(__global float *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5 when the data format is NCHW and the output tile is 4x4
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+    // Address of the top-left element of the 5x5 filter plane processed by this work-item
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+    const char   stride_x = 4 * sizeof(float); // Used for accessing the last value in each row
+    // Byte offsets of the 5 filter rows (components .s5-.s7 unused).
+    // NOTE(review): the offsets are held in uchar components, so 4 * src_stride_y must
+    // fit in 8 bits (src_stride_y <= 63 bytes) — confirm callers never pad the filter
+    // tensor's rows beyond that, otherwise this silently wraps.
+    const uchar8 stride_y = (uchar8)(0, 1, 2, 3, 4, 0, 0, 0) * (uchar8)src_stride_y;
+
+    // wR0 = first four elements of filter row R, wR1 = fifth element of row R
+    float4 w00 = vload4(0, (__global float *)(src_addr + stride_y.s0));
+    float  w01 = *((__global float *)(src_addr + stride_y.s0 + stride_x));
+    float4 w10 = vload4(0, (__global float *)(src_addr + stride_y.s1));
+    float  w11 = *((__global float *)(src_addr + stride_y.s1 + stride_x));
+    float4 w20 = vload4(0, (__global float *)(src_addr + stride_y.s2));
+    float  w21 = *((__global float *)(src_addr + stride_y.s2 + stride_x));
+    float4 w30 = vload4(0, (__global float *)(src_addr + stride_y.s3));
+    float  w31 = *((__global float *)(src_addr + stride_y.s3 + stride_x));
+    float4 w40 = vload4(0, (__global float *)(src_addr + stride_y.s4));
+    float  w41 = *((__global float *)(src_addr + stride_y.s4 + stride_x));
+
+    // Transform the 5x5 tile into an 8x8 tile: each outN below holds one row
+    // of the transformed filter in its .s0-.s7 components
+    float8 out0 = 0.0f;
+    float8 out1 = 0.0f;
+    float8 out2 = 0.0f;
+    float8 out3 = 0.0f;
+    float8 out4 = 0.0f;
+    float8 out5 = 0.0f;
+    float8 out6 = 0.0f;
+    float8 out7 = 0.0f;
+
+    // Row 0
+    out0.s0 = w00.s0;
+    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+    out0.s7 = w01;
+
+    // Row 1
+    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+    // Row 2
+    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+    // Row 3
+    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 4
+    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 5
+    out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+    out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+    // Row 6
+    out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+    out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+    // Row 7
+    out7.s0 = w40.s0;
+    out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+    out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+    out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+    out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+    out7.s7 = w41;
+
+    // Decompose the Z work-item index into (filter index, channel index)
+    int z  = get_global_id(2);
+    int x0 = z / NUM_CHANNELS; // idx filter
+    int y0 = z % NUM_CHANNELS; // idx channel
+
+    // Get output address: filter index selects X, channel index selects Y
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the 64 values across the 64 channels
+    *(__global float *)(dst_addr + 0 * dst_stride_z)  = out0.s0;
+    *(__global float *)(dst_addr + 1 * dst_stride_z)  = out0.s1;
+    *(__global float *)(dst_addr + 2 * dst_stride_z)  = out0.s2;
+    *(__global float *)(dst_addr + 3 * dst_stride_z)  = out0.s3;
+    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out0.s4;
+    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out0.s5;
+    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out0.s6;
+    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out0.s7;
+    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global float *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global float *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global float *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global float *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global float *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global float *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global float *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global float *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global float *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global float *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global float *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global float *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global float *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global float *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global float *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global float *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global float *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global float *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global float *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global float *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global float *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global float *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global float *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global float *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global float *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global float *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global float *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global float *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global float *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global float *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global float *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global float *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global float *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global float *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global float *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global float *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global float *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global float *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global float *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global float *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global float *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global float *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global float *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global float *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global float *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global float *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global float *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global float *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+}
+#endif // defined(NUM_CHANNELS)
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] dst_ptr                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    const int tile_x  = get_global_id(0);
+    const int tile_y  = get_global_id(1);
+    const int channel = get_global_id(2);
+
+    // Address of the top-left element of this work-item's 4x4 input tile:
+    // tiles advance by 2 elements in X and Y, shifted back by the padding
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + tile_x * 2 * src_stride_x + tile_y * 2 * src_stride_y + channel * src_stride_z;
+
+    src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+    // Load the 4x4 input tile, one row per float4
+    const float4 d0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+    const float4 d1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+    const float4 d2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+    const float4 d3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+    // Vertical pass of the transform, applied to whole rows at once
+    const float4 r0 = d0 - d2;
+    const float4 r1 = d1 + d2;
+    const float4 r2 = d2 - d1;
+    const float4 r3 = d1 - d3;
+
+    // Horizontal pass: each oN packs the four transformed values of row N
+    const float4 o0 = (float4)(r0.s0 - r0.s2, r0.s1 + r0.s2, r0.s2 - r0.s1, r0.s1 - r0.s3);
+    const float4 o1 = (float4)(r1.s0 - r1.s2, r1.s1 + r1.s2, r1.s2 - r1.s1, r1.s1 - r1.s3);
+    const float4 o2 = (float4)(r2.s0 - r2.s2, r2.s1 + r2.s2, r2.s2 - r2.s1, r2.s1 - r2.s3);
+    const float4 o3 = (float4)(r3.s0 - r3.s2, r3.s1 + r3.s2, r3.s2 - r3.s1, r3.s1 - r3.s3);
+
+    // Output address: channel selects X, the linearized tile index selects Y;
+    // the 16 transformed values are spread across Z
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + channel * dst_stride_x + (tile_x + tile_y * (int)NUM_TILES_X) * dst_stride_y;
+
+    *((__global float *)(dst_addr + 0 * dst_stride_z))  = o0.s0;
+    *((__global float *)(dst_addr + 1 * dst_stride_z))  = o0.s1;
+    *((__global float *)(dst_addr + 2 * dst_stride_z))  = o0.s2;
+    *((__global float *)(dst_addr + 3 * dst_stride_z))  = o0.s3;
+    *((__global float *)(dst_addr + 4 * dst_stride_z))  = o1.s0;
+    *((__global float *)(dst_addr + 5 * dst_stride_z))  = o1.s1;
+    *((__global float *)(dst_addr + 6 * dst_stride_z))  = o1.s2;
+    *((__global float *)(dst_addr + 7 * dst_stride_z))  = o1.s3;
+    *((__global float *)(dst_addr + 8 * dst_stride_z))  = o2.s0;
+    *((__global float *)(dst_addr + 9 * dst_stride_z))  = o2.s1;
+    *((__global float *)(dst_addr + 10 * dst_stride_z)) = o2.s2;
+    *((__global float *)(dst_addr + 11 * dst_stride_z)) = o2.s3;
+    *((__global float *)(dst_addr + 12 * dst_stride_z)) = o3.s0;
+    *((__global float *)(dst_addr + 13 * dst_stride_z)) = o3.s1;
+    *((__global float *)(dst_addr + 14 * dst_stride_z)) = o3.s2;
+    *((__global float *)(dst_addr + 15 * dst_stride_z)) = o3.s3;
+}
+
+/** This OpenCL kernel computes the input transform when the kernel size is 3x3, the output tile is 2x2 and the number of channels is multiple of 2
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    // Variant of the F(2x2, 3x3) input transform that processes two
+    // consecutive input channels (z and z + 1) per work-item and stores the
+    // results as float2 pairs, one pair per transformed element.
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2) * 2;
+
+    // Compute input address
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 2 * src_stride_x + y * 2 * src_stride_y + z * src_stride_z;
+
+    // Shift the tile origin back by the padding amount
+    src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+    // 4x4 tile of the first channel (z)
+    float4 in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+    float4 in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+    float4 in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+    float4 in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+    // 4x4 tile of the second channel (z + 1)
+    src_addr += src_stride_z;
+    float4 in_row4 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+    float4 in_row5 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+    float4 in_row6 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+    float4 in_row7 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+
+    // 1D input transform along the rows for both channels (tmp = B^T * d)
+    float4 tmp0 = in_row0 - in_row2;
+    float4 tmp1 = in_row1 + in_row2;
+    float4 tmp2 = in_row2 - in_row1;
+    float4 tmp3 = in_row1 - in_row3;
+
+    float4 tmp4 = in_row4 - in_row6;
+    float4 tmp5 = in_row5 + in_row6;
+    float4 tmp6 = in_row6 - in_row5;
+    float4 tmp7 = in_row5 - in_row7;
+
+    // Same 1D transform along the columns; .s0 holds channel z, .s1 channel z + 1
+    float2 out00 = (float2)(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+    float2 out01 = (float2)(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+    float2 out02 = (float2)(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+    float2 out03 = (float2)(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+
+    float2 out10 = (float2)(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+    float2 out11 = (float2)(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+    float2 out12 = (float2)(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+    float2 out13 = (float2)(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+
+    float2 out20 = (float2)(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+    float2 out21 = (float2)(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+    float2 out22 = (float2)(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+    float2 out23 = (float2)(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+
+    float2 out30 = (float2)(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+    float2 out31 = (float2)(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+    float2 out32 = (float2)(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+    float2 out33 = (float2)(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+
+    // dst layout: input channel along X, tile index (x + y * NUM_TILES_X)
+    // along Y, transformed element (0..15) along Z. Each vstore2 writes the
+    // same transformed element for both channels z and z + 1.
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+
+    vstore2(out00, 0, (__global float *)(dst_addr + 0 * dst_stride_z));
+    vstore2(out01, 0, (__global float *)(dst_addr + 1 * dst_stride_z));
+    vstore2(out02, 0, (__global float *)(dst_addr + 2 * dst_stride_z));
+    vstore2(out03, 0, (__global float *)(dst_addr + 3 * dst_stride_z));
+    vstore2(out10, 0, (__global float *)(dst_addr + 4 * dst_stride_z));
+    vstore2(out11, 0, (__global float *)(dst_addr + 5 * dst_stride_z));
+    vstore2(out12, 0, (__global float *)(dst_addr + 6 * dst_stride_z));
+    vstore2(out13, 0, (__global float *)(dst_addr + 7 * dst_stride_z));
+    vstore2(out20, 0, (__global float *)(dst_addr + 8 * dst_stride_z));
+    vstore2(out21, 0, (__global float *)(dst_addr + 9 * dst_stride_z));
+    vstore2(out22, 0, (__global float *)(dst_addr + 10 * dst_stride_z));
+    vstore2(out23, 0, (__global float *)(dst_addr + 11 * dst_stride_z));
+    vstore2(out30, 0, (__global float *)(dst_addr + 12 * dst_stride_z));
+    vstore2(out31, 0, (__global float *)(dst_addr + 13 * dst_stride_z));
+    vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z));
+    vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
+}
+
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, the filter size 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    // Winograd input transform F(4x4, 3x3): each work-item reads a 6x6 input
+    // tile from channel z (tiles are 4 elements apart in x/y) and writes the
+    // 36 transformed values across 36 consecutive Z planes of dst. Rows are
+    // loaded and consumed incrementally to limit register pressure.
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    // Compute input address
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 4 * src_stride_x + y * 4 * src_stride_y + z * src_stride_z;
+
+    // Shift the tile origin back by the padding amount
+    src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+    // Row4: loaded first because its transformed values k0..k5 contribute to
+    // output channels [0, 29] and seed all 30 accumulators below.
+    // Each row is read as a float4 (elements 0..3) plus a float2 (elements 4..5).
+    float4 d40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+    float2 d41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y));
+
+    // k0..k4 share the common term d41.s0 (element 4 of row 4)
+    float k0 = d41.s0;
+    float k1 = d41.s0;
+    float k2 = d41.s0;
+    float k3 = d41.s0;
+    float k4 = d41.s0;
+    float k5 = 0.0f;
+
+    // 1D input transform of row 4
+    k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+    k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+    k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+    k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+    k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+    k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+    // Row0
+    float4 d00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+    float2 d01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y));
+
+    // Row2
+    float4 d20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+    float2 d21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y));
+
+    // Compute destination address
+    // dst layout: input channel along X, tile index (x + y * NUM_TILES_X)
+    // along Y, transformed element (0..35) along Z.
+    __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+
+    // Distance between consecutive transformed channels, in floats
+    uint dst_plane_stride = dst_stride_z / sizeof(float);
+
+    // Seed all 30 accumulators (channels [0, 29]) with the row-4 terms k0..k5
+    float out0  = k0;
+    float out1  = k1;
+    float out2  = k2;
+    float out3  = k3;
+    float out4  = k4;
+    float out5  = k5;
+    float out6  = k0;
+    float out7  = k1;
+    float out8  = k2;
+    float out9  = k3;
+    float out10 = k4;
+    float out11 = k5;
+    float out12 = k0;
+    float out13 = k1;
+    float out14 = k2;
+    float out15 = k3;
+    float out16 = k4;
+    float out17 = k5;
+    float out18 = k0;
+    float out19 = k1;
+    float out20 = k2;
+    float out21 = k3;
+    float out22 = k4;
+    float out23 = k5;
+    float out24 = k0;
+    float out25 = k1;
+    float out26 = k2;
+    float out27 = k3;
+    float out28 = k4;
+    float out29 = k5;
+
+    // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+    out0 += 16.0f * d00.s0 - 20.0f * d00.s2 - 20.0f * d20.s0 + 25.0f * d20.s2 + 4.0f * d01.s0 - 5.0f * d21.s0;
+    out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+    out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 - 20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+    out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+    out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 - 10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+    out5 += 16.0f * d00.s1 - 20.0f * d00.s3 - 20.0f * d20.s1 + 4.0f * d01.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+
+    // Store channels [0, 5] so out0..out5 can be reused for channels [30, 35]
+    *(dst_addr) = out0;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out1;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out2;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out3;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out4;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out5;
+    dst_addr += dst_plane_stride;
+
+    // Row1
+    float4 d10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+    float2 d11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y));
+
+    // Row3
+    float4 d30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+    float2 d31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y));
+
+    // Compute common parts for the channels between [6, 29]
+    // Channels [6, 11]:  [out10, out11, out12, out13, out14, out15]
+    // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+    float part0  = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+    float part1  = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+    float part2  = 16.0f * d20.s2 - 4.0f * d21.s0;
+    float part3  = 16.0f * d20.s1 - 4.0f * d20.s3;
+    float part4  = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+    float part5  = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+    float part6  = 4.0f * d20.s2 - 4.0f * d21.s0;
+    float part7  = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+    float part8  = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+    float part9  = 8.0f * d20.s1 - 8.0f * d20.s3;
+    float part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+    float part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+    // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+    // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+    // part13/14/17/18/21/22 reuse the earlier parts scaled by 0.25 (the row-3/4
+    // transform coefficients are 1/4 of the row-1/2 ones)
+    float part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+    float part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+    float part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+    float part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+    float part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+    float part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+    float part18 = part6 * 0.25f; // d20.s2 - d21.s0
+    float part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+    float part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+    float part21 = part9 * 0.25f;                                                 // 2.0f * (d20.s1 - d20.s3)
+    float part22 = part10 * 0.25f;                                                // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+    float part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+    // Accumulate channels [6, 17]; paired channels share parts with opposite signs
+    out6 += part0 - part1;
+    out12 += part0 + part1;
+    out7 += part2 + part3 + part4 + part5;
+    out8 += part2 - part3 + part4 - part5;
+    out13 += part2 + part3 - part4 - part5;
+    out14 += part2 - part3 - part4 + part5;
+    out9 += part6 + part7 + part8 + part9;
+    out10 += part6 - part7 + part8 - part9;
+    out15 += part6 - part7 - part8 + part9;
+    out16 += part6 + part7 - part8 - part9;
+    out11 += part10 + part11;
+    out17 += part10 - part11;
+
+    // Accumulate channels [18, 29]
+    out18 += part13 - part12;
+    out24 += part13 + part12;
+    out19 += part14 + part15 + part16 + part17;
+    out20 += part14 - part15 + part16 - part17;
+    out25 += part14 - part15 - part16 + part17;
+    out26 += part14 + part15 - part16 - part17;
+    out21 += part18 + part19 + part20 + part21;
+    out22 += part18 - part19 + part20 - part21;
+    out27 += part18 - part19 - part20 + part21;
+    out28 += part18 + part19 - part20 - part21;
+    out23 += part22 + part23;
+    out29 += part22 - part23;
+
+    // Store channels [6, 17]
+    *(dst_addr) = out6;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out7;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out8;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out9;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out10;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out11;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out12;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out13;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out14;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out15;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out16;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out17;
+    dst_addr += dst_plane_stride;
+
+    // Store channels [18, 29]
+    *(dst_addr) = out18;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out19;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out20;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out21;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out22;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out23;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out24;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out25;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out26;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out27;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out28;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out29;
+    dst_addr += dst_plane_stride;
+
+    // Row5: only needed for the last 6 channels, so it is loaded last
+    float4 d50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y));
+    float2 d51 = vload2(2, (__global float *)(src_addr + 5 * src_stride_y));
+
+    // Channels [30, 35]
+    out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+    out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+    // Store channels [30, 35]
+    *(dst_addr) = out0;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out1;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out2;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out3;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out4;
+    dst_addr += dst_plane_stride;
+    *(dst_addr) = out5;
+    dst_addr += dst_plane_stride;
+}
+
+/** Computes one transformed output row for the 4x4/5x5 Winograd input transform.
+ *
+ * Applies the 8-point 1D input transform to the 8-element vector @p tmp and
+ * writes the result to @p out. @p comm_fact is a caller-provided float8 used
+ * as scratch for common sub-expressions; its previous contents are clobbered.
+ */
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact)                     \
+    ({                                                              \
+        comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6;            \
+        comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5;            \
+        comm_fact.s2 = 2.5f * tmp.s3;                               \
+        comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+        comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6;    \
+        comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4;        \
+        comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+        \
+        out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+        out.s1 = comm_fact.s0 + comm_fact.s1;                       \
+        out.s2 = comm_fact.s0 - comm_fact.s1;                       \
+        out.s3 = comm_fact.s3 + comm_fact.s4;                       \
+        out.s4 = comm_fact.s4 - comm_fact.s3;                       \
+        out.s5 = comm_fact.s5 + comm_fact.s6;                       \
+        out.s6 = comm_fact.s5 - comm_fact.s6;                       \
+        out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+    })
+
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5 and the output tile is 4x4
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    // Winograd input transform F(4x4, 5x5): each work-item reads an 8x8 input
+    // tile from channel z (tiles are 4 elements apart in x/y) and writes the
+    // 64 transformed values across 64 consecutive Z planes of dst.
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    // Compute input address
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 4 * src_stride_x + y * 4 * src_stride_y + z * src_stride_z;
+
+    // Shift the tile origin back by the padding amount
+    src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+    // Load 8x8 input tile
+    const float8 in_row0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
+    const float8 in_row1 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
+    const float8 in_row2 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
+    const float8 in_row3 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
+    const float8 in_row4 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
+    const float8 in_row5 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
+    const float8 in_row6 = vload8(0, (__global float *)(src_addr + 6 * src_stride_y));
+    const float8 in_row7 = vload8(0, (__global float *)(src_addr + 7 * src_stride_y));
+
+    // Calculate common factors for intermediate tensor
+    float8 comm_fact0 = in_row2 + in_row6 - 4.25f * in_row4;
+    float8 comm_fact1 = in_row1 + in_row5 - 4.25f * in_row3;
+    float8 comm_fact2 = 0.25f * in_row2 - 1.25f * in_row4 + in_row6;
+
+    // Calculate intermediate tensor (the row-wise 1D transform) and reuse the
+    // common factor vectors
+    const float8 tmp0 = in_row0 - in_row6 + 5.25f * in_row4 - 5.25f * in_row2;
+    const float8 tmp1 = comm_fact0 + comm_fact1;
+    const float8 tmp2 = comm_fact0 - comm_fact1;
+
+    comm_fact0 = 2.5f * in_row3;
+    comm_fact1 = 0.5f * in_row1 - comm_fact0 + 2.f * in_row5;
+
+    const float8 tmp3 = comm_fact1 + comm_fact2;
+    const float8 tmp4 = comm_fact2 - comm_fact1;
+
+    comm_fact1 = 2.f * in_row1 - comm_fact0 + 0.5f * in_row5;
+    comm_fact2 = 4.f * in_row2 - 5.f * in_row4 + in_row6;
+
+    const float8 tmp5 = comm_fact1 + comm_fact2;
+    const float8 tmp6 = comm_fact2 - comm_fact1;
+    const float8 tmp7 = in_row7 - in_row1 + 5.25f * in_row3 - 5.25f * in_row5;
+
+    // Calculate output rows: the same 1D transform applied column-wise
+    // (reuse comm_fact0 vector as scratch)
+    float8 out0, out1, out2, out3, out4, out5, out6, out7;
+
+    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+
+    // Store values across the 64 channels
+    // dst layout: input channel along X, tile index (x + y * NUM_TILES_X)
+    // along Y, transformed element (0..63) along Z.
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+
+    *((__global float *)(dst_addr + 0 * dst_stride_z))  = out0.s0;
+    *((__global float *)(dst_addr + 1 * dst_stride_z))  = out0.s1;
+    *((__global float *)(dst_addr + 2 * dst_stride_z))  = out0.s2;
+    *((__global float *)(dst_addr + 3 * dst_stride_z))  = out0.s3;
+    *((__global float *)(dst_addr + 4 * dst_stride_z))  = out0.s4;
+    *((__global float *)(dst_addr + 5 * dst_stride_z))  = out0.s5;
+    *((__global float *)(dst_addr + 6 * dst_stride_z))  = out0.s6;
+    *((__global float *)(dst_addr + 7 * dst_stride_z))  = out0.s7;
+    *((__global float *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global float *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global float *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global float *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global float *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global float *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global float *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global float *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global float *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global float *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global float *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global float *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global float *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global float *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global float *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global float *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global float *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global float *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global float *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global float *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global float *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global float *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global float *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global float *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global float *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global float *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global float *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global float *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global float *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global float *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global float *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global float *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global float *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global float *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global float *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global float *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global float *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global float *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global float *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global float *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global float *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global float *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global float *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global float *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global float *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global float *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global float *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global float *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global float *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global float *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global float *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global float *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global float *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global float *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global float *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+}
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
+
+#if defined(NUM_TILES_X)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2, the filter size 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 2x2 tile
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+    // Load the values across the 16 channels to compose the 4x4 tile
+    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+
+    float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
+    float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
+    float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
+    float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
+
+    float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
+    float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
+    float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
+    float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+    float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
+    float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
+    float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
+    float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
+
+    // Compute the 2x2 output tile
+    // k0..k3 are column sums/differences shared between two output elements each
+    float k0 = d01 + d11 + d21;
+    float k1 = d02 + d12 + d22;
+    float k2 = d11 - d21 - d31;
+    float k3 = d12 - d22 - d32;
+
+    // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+    // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+    // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+    // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+
+    float out00 = d10;
+    float out01 = -d13;
+    float out10 = d10;
+    float out11 = -d13;
+
+    out00 += d00 + d20 + k0 + k1;
+    out01 += k0 - k1 - (d03 + d23);
+    out10 += -d20 - d30 + k2 + k3;
+    out11 += k2 - k3 + d23 + d33;
+
+    // get_global_id(1) is the linear tile index; tiles are laid out row-major
+    // with NUM_TILES_X tiles per row, and each tile covers 2x2 output pixels
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * 2;
+    int y_out = (y_in / NUM_TILES_X) * 2;
+    int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+    out10 += (float)b;
+    out11 += (float)b;
+#endif // defined(HAS_BIAS)
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+    // Store the 2x2 output tile
+    vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 4x4 tile
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+    // Load the values across the 36 channels to compose the 6x6 tile
+    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+    float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
+    float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+
+    float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
+    float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
+    float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
+    float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
+    float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
+    float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+    float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
+    float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
+    float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
+    float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
+    float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
+    float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+
+    float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
+    float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
+    float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
+    float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
+    float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
+    float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+
+    float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
+    float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
+    float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
+    float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
+    float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
+    float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+
+    float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
+    float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
+    float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
+    float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
+    float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
+    float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+
+    // Compute out00, out01, out02 and out03
+    // Each row below starts from the terms common to the whole output row,
+    // then k0/k1 factor the sub-expressions shared between the columns
+    float out00 = d01 + d21 + d41 + d11 + d31;
+    float out01 = d01 + d21 + d41 + d11 + d31;
+    float out02 = d01 + d21 + d41 + d11 + d31;
+    float out03 = d01 + d21 + d41 + d11 + d31;
+
+    float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+    float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+    out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+    out01 += k1 - d02 - d12 - d22 - d32 - d42;
+    out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+    out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+    // Compute out10, out11, out12 and out13
+    float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+    float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+    k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+    k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+    out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+    out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+    out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+    out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+    // Compute out20, out21, out22 and out23
+    float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+    float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+    k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+    k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+    out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+    out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+    out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+    out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+    // Compute out30, out31, out32 and out33
+    float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+    float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+    k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+    k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+    out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+    out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+    out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+    out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+
+    // get_global_id(1) is the linear tile index; tiles are laid out row-major
+    // with NUM_TILES_X tiles per row, and each tile covers 4x4 output pixels
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * 4;
+    int y_out = (y_in / NUM_TILES_X) * 4;
+    int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+    out00 += (float)b;
+    out01 += (float)b;
+    out02 += (float)b;
+    out03 += (float)b;
+
+    out10 += (float)b;
+    out11 += (float)b;
+    out12 += (float)b;
+    out13 += (float)b;
+
+    out20 += (float)b;
+    out21 += (float)b;
+    out22 += (float)b;
+    out23 += (float)b;
+
+    out30 += (float)b;
+    out31 += (float)b;
+    out32 += (float)b;
+    out33 += (float)b;
+
+#endif // defined(HAS_BIAS)
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+    // Store the 4x4 output tile
+    vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+    vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+    vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+}
+
+/** Computes one column (col, a float4) of the output transform from the 8 input values
+ *  d0..d7 of the corresponding column of the 8x8 input tile, using comm_fact (a float4)
+ *  as scratch storage for factored sums/differences of adjacent input pairs.
+ */
+#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact)  \
+    ({                                                                   \
+        comm_fact.s0 = d1 + d2;                                          \
+        comm_fact.s1 = d3 + d4;                                          \
+        comm_fact.s2 = d5 + d6;                                          \
+        \
+        col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0;  \
+        col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
+        \
+        comm_fact.s0 = d1 - d2;                                          \
+        comm_fact.s1 = d3 - d4;                                          \
+        comm_fact.s2 = d5 - d6;                                          \
+        \
+        col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
+        col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7;  \
+    })
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 5x5 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 4x4 tile
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+    // Load the values across the 64 channels to compose the 8x8 input tile
+    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+    float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
+    float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+    float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
+    float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
+
+    float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
+    float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
+    float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
+    float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
+    float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
+    float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
+    float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
+    float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
+
+    float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
+    float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
+    float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
+    float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
+    float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
+    float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
+    float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
+    float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
+
+    float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
+    float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
+    float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
+    float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
+    float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
+    float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
+    float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
+    float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
+
+    float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
+    float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
+    float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
+    float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
+    float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
+    float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
+    float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
+    float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
+
+    float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
+    float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
+    float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
+    float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
+    float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
+    float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
+    float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
+    float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
+
+    float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
+    float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
+    float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
+    float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
+    float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
+    float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
+    float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
+    float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
+
+    float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
+    float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
+    float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
+    float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
+    float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
+    float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
+    float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
+    float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
+
+    // Compute the 8x4 intermediate tensor
+    // (one COMPUTE_TMP_COL invocation per column of the 8x8 input tile)
+    float4 comm_fact0, comm_fact1, comm_fact2;
+    float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+    COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);
+    COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);
+
+    // Compute the 4x4 output tile
+    // (the same transform is now applied across the columns, vectorized as float4)
+    comm_fact0 = tmp_col1 + tmp_col2;
+    comm_fact1 = tmp_col3 + tmp_col4;
+    comm_fact2 = tmp_col5 + tmp_col6;
+
+    float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
+    float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+
+    comm_fact0 = tmp_col1 - tmp_col2;
+    comm_fact1 = tmp_col3 - tmp_col4;
+    comm_fact2 = tmp_col5 - tmp_col6;
+
+    float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
+    float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+
+    // get_global_id(1) is the linear tile index; tiles are laid out row-major
+    // with NUM_TILES_X tiles per row, and each tile covers 4x4 output pixels
+    int y_in  = get_global_id(1);
+    int x_out = (y_in % NUM_TILES_X) * 4;
+    int y_out = (y_in / NUM_TILES_X) * 4;
+    int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+    // Add bias
+    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+    out_col0 += (float4)b;
+    out_col1 += (float4)b;
+    out_col2 += (float4)b;
+    out_col3 += (float4)b;
+#endif // defined(HAS_BIAS)
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+    // Store the 4x4 output tile (out_colN holds output column N, one element per row)
+    *(__global float *)(dst_addr + 0 * dst_stride_x + 0 * dst_stride_y) = out_col0.s0;
+    *(__global float *)(dst_addr + 1 * dst_stride_x + 0 * dst_stride_y) = out_col1.s0;
+    *(__global float *)(dst_addr + 2 * dst_stride_x + 0 * dst_stride_y) = out_col2.s0;
+    *(__global float *)(dst_addr + 3 * dst_stride_x + 0 * dst_stride_y) = out_col3.s0;
+    *(__global float *)(dst_addr + 0 * dst_stride_x + 1 * dst_stride_y) = out_col0.s1;
+    *(__global float *)(dst_addr + 1 * dst_stride_x + 1 * dst_stride_y) = out_col1.s1;
+    *(__global float *)(dst_addr + 2 * dst_stride_x + 1 * dst_stride_y) = out_col2.s1;
+    *(__global float *)(dst_addr + 3 * dst_stride_x + 1 * dst_stride_y) = out_col3.s1;
+    *(__global float *)(dst_addr + 0 * dst_stride_x + 2 * dst_stride_y) = out_col0.s2;
+    *(__global float *)(dst_addr + 1 * dst_stride_x + 2 * dst_stride_y) = out_col1.s2;
+    *(__global float *)(dst_addr + 2 * dst_stride_x + 2 * dst_stride_y) = out_col2.s2;
+    *(__global float *)(dst_addr + 3 * dst_stride_x + 2 * dst_stride_y) = out_col3.s2;
+    *(__global float *)(dst_addr + 0 * dst_stride_x + 3 * dst_stride_y) = out_col0.s3;
+    *(__global float *)(dst_addr + 1 * dst_stride_x + 3 * dst_stride_y) = out_col1.s3;
+    *(__global float *)(dst_addr + 2 * dst_stride_x + 3 * dst_stride_y) = out_col2.s3;
+    *(__global float *)(dst_addr + 3 * dst_stride_x + 3 * dst_stride_y) = out_col3.s3;
+}
+#endif // defined(NUM_TILES_X)
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 87fc1d0..293361b 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -46,10 +46,23 @@
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+    if(beta != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
+    }
+    if(gamma != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
+    }
+
     if(act_info.enabled())
     {
         ActivationLayerInfo::ActivationFunction act = act_info.activation();
@@ -62,6 +75,7 @@
     if(output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
@@ -69,7 +83,8 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                        ITensorInfo *mean, ITensorInfo *var, ITensorInfo *beta, ITensorInfo *gamma)
 {
     if(output != nullptr)
     {
@@ -95,6 +110,24 @@
         window_changed = update_window_and_padding(win, input_access);
     }
 
+    if(input->data_layout() == DataLayout::NHWC)
+    {
+        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, mean_access, var_access);
+
+        if(beta != nullptr)
+        {
+            AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
+            window_changed = window_changed || update_window_and_padding(win, beta_access);
+        }
+        if(gamma != nullptr)
+        {
+            AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
+            window_changed = window_changed || update_window_and_padding(win, gamma_access);
+        }
+    }
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
@@ -108,7 +141,7 @@
 void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
                                                 float epsilon, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
 
     _input   = input;
     _output  = output;
@@ -120,15 +153,9 @@
 
     _run_in_place = (output == nullptr) || (output == input);
 
-    if(output != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
-        // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), *input->info()->clone());
-    }
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
-                                                  mean->info(), var->info(), beta->info(), gamma->info(), epsilon, act_info));
+                                                  mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+                                                  (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
 
     const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
@@ -136,26 +163,41 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option_if(act_info.enabled(), "-D" + string_from_activation_func(act_info.activation()));
+    build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
     build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
     build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option_if(beta == nullptr, "-DUSE_DEFAULT_BETA");
+    build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts.options()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
 
     // Set kernel static arguments
     unsigned int include_output = (!_run_in_place) ? 1 : 0;
-    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input/output and mean/var parameters
+    if(_beta != nullptr)
+    {
+        idx += num_arguments_per_1D_tensor(); // Skip beta parameter
+    }
+    if(_gamma != nullptr)
+    {
+        idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
+    }
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(),
+                                                    mean->info(), var->info(),
+                                                    (beta != nullptr) ? beta->info() : nullptr,
+                                                    (gamma != nullptr) ? gamma->info() : nullptr);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
     _config_id = "batch_normalization_layer_";
+    _config_id += string_from_data_layout(input->info()->data_layout());
+    _config_id += "_";
     _config_id += string_from_data_type(input->info()->data_type());
     _config_id += "_";
     _config_id += support::cpp11::to_string(input->info()->dimension(0));
@@ -172,7 +214,11 @@
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(),
+                                                              mean->clone().get(), var->clone().get(),
+                                                              (beta != nullptr) ? beta->clone().get() : nullptr,
+                                                              (gamma != nullptr) ? gamma->clone().get() : nullptr)
+                                .first);
 
     return Status{};
 }
@@ -191,8 +237,14 @@
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, vector_slice);
     add_1D_tensor_argument(idx, _var, vector_slice);
-    add_1D_tensor_argument(idx, _beta, vector_slice);
-    add_1D_tensor_argument(idx, _gamma, vector_slice);
+    if(_beta != nullptr)
+    {
+        add_1D_tensor_argument(idx, _beta, vector_slice);
+    }
+    if(_gamma != nullptr)
+    {
+        add_1D_tensor_argument(idx, _gamma, vector_slice);
+    }
 
     do
     {
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
index d729ebc..6e55e66 100644
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,41 +48,62 @@
 
 void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
 
-    const Format fmt = output->info()->format();
-    _planes[0]       = plane0;
-    _planes[1]       = plane1;
-    _planes[2]       = plane2;
-    if(Format::RGBA8888 == fmt)
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+
+    const Format output_format = output->info()->format();
+
+    // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
+    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
     {
+        // Validate Y plane of input and output
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
+
+        // Validate U and V plane of the input
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
+    }
+
+    _planes[0] = plane0;
+    _planes[1] = plane1;
+    _planes[2] = plane2;
+    _planes[3] = nullptr;
+
+    // Validate the last input tensor only for RGBA format
+    if(Format::RGBA8888 == output_format)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
+        ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
+
         ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
+
         _planes[3] = plane3;
     }
-    else
-    {
-        _planes[3] = nullptr;
-    }
+
     _output       = output;
     _output_multi = nullptr;
 
-    // Half the processed elements for U,V channels due to sub-sampling of 2
-    if(Format::YUYV422 == fmt || Format::UYVY422 == fmt)
+    // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
+    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
     {
-        _x_subsampling = { { 1, 2, 2 } };
-        _y_subsampling = { { 1, 2, 2 } };
-    }
-    else
-    {
-        _x_subsampling = { { 1, 1, 1 } };
-        _y_subsampling = { { 1, 1, 1 } };
+        _x_subsampling[1] = 2;
+        _x_subsampling[2] = 2;
     }
 
     // Create kernel
-    std::string kernel_name = "channel_combine_" + string_from_format(fmt);
+    std::string kernel_name = "channel_combine_" + string_from_format(output_format);
     _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
 
     // Configure window
@@ -112,50 +133,78 @@
 
 void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
 
-    _planes[0]           = plane0;
-    _planes[1]           = plane1;
-    _planes[2]           = plane2;
-    _planes[3]           = nullptr;
-    _output              = nullptr;
-    _output_multi        = output;
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
+
+    const Format output_format = output->info()->format();
+
+    // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
+    // Perform validation only for formats which require sub-sampling.
+    if(Format::YUV444 != output_format)
+    {
+        // Validate Y plane of input and output
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
+
+        // Validate U and V plane of the input
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
+
+        // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
+        // MultiImage generates the correct tensor shape, but also check in case the planes' tensor shapes were changed to a wrong size
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
+
+        // Validate the last plane V of format IYUV
+        if(Format::IYUV == output_format)
+        {
+            // Validate V plane of the output
+            ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
+        }
+    }
+
+    // Set input tensors
+    _planes[0] = plane0;
+    _planes[1] = plane1;
+    _planes[2] = plane2;
+    _planes[3] = nullptr;
+
+    // Set output tensor
+    _output       = nullptr;
+    _output_multi = output;
+
     bool has_two_planars = false;
 
     // Set sub-sampling parameters for each plane
-    const Format          fmt = output->info()->format();
     std::string           kernel_name;
     std::set<std::string> build_opts;
 
-    if(Format::NV12 == fmt || Format::NV21 == fmt)
+    if(Format::NV12 == output_format || Format::NV21 == output_format)
     {
         _x_subsampling = { { 1, 2, 2 } };
         _y_subsampling = { { 1, 2, 2 } };
         kernel_name    = "channel_combine_NV";
-        build_opts.emplace(Format::NV12 == fmt ? "-DNV12" : "-DNV21");
+        build_opts.emplace(Format::NV12 == output_format ? "-DNV12" : "-DNV21");
         has_two_planars = true;
     }
     else
     {
-        if(Format::IYUV == fmt)
+        if(Format::IYUV == output_format)
         {
             _x_subsampling = { { 1, 2, 2 } };
             _y_subsampling = { { 1, 2, 2 } };
         }
-        else
-        {
-            _x_subsampling = { { 1, 1, 1 } };
-            _y_subsampling = { { 1, 1, 1 } };
-        }
 
         kernel_name = "copy_planes_3p";
-        build_opts.emplace(Format::IYUV == fmt ? "-DIYUV" : "-DYUV444");
+        build_opts.emplace(Format::IYUV == output_format ? "-DIYUV" : "-DYUV444");
     }
 
     // Create kernel
@@ -166,12 +215,12 @@
 
     Window win = calculate_max_window(*plane0->info(), Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal input_plane0_access(plane0->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle  input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-    AccessWindowRectangle  output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]);
-    AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle  output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+    AccessWindowRectangle input_plane0_access(plane0->info(), 0, 0, num_elems_processed_per_iteration, 1.f);
+    AccessWindowRectangle input_plane1_access(plane1->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+    AccessWindowRectangle input_plane2_access(plane2->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+    AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f, 1.f / _y_subsampling[1]);
+    AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+    AccessWindowRectangle output_plane2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1.f, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
 
     update_window_and_padding(win,
                               input_plane0_access, input_plane1_access, input_plane2_access,
@@ -192,6 +241,7 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
     Window slice = window.first_slice_window_2D();
+    slice.set_dimension_step(Window::DimY, 1);
 
     do
     {
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
new file mode 100644
index 0000000..a667119
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::QS16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
+
+    const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+    // There cannot be more groups than channels
+    ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+
+    // Configure kernel window
+    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+    const bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLChannelShuffleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _input  = input;
+    _output = output;
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+
+    const unsigned int channels   = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+
+    // Set kernel build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
+    build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+    switch(input->info()->element_size())
+    {
+        case 1:
+            build_opts.add_option("-DDATA_TYPE=uchar");
+            break;
+        case 2:
+            build_opts.add_option("-DDATA_TYPE=ushort");
+            break;
+        case 4:
+            build_opts.add_option("-DDATA_TYPE=uint");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void CLChannelShuffleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 8ccec06..91c0430 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -51,7 +51,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
 
     return Status{};
@@ -111,8 +111,8 @@
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning over 30 representative tensor shapes.
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-    if(gpu_target == GPUTarget::BIFROST)
+    const GPUTarget gpu_target = get_target();
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
         if((_convolved_dims.first == 7) || (_convolved_dims.first == 14))
         {
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000..1b211b0
--- /dev/null
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+CLConvertFullyConnectedWeightsKernel::CLConvertFullyConnectedWeightsKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLConvertFullyConnectedWeightsKernel::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+                                                     DataLayout data_layout)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
+
+    _input  = input;
+    _output = output;
+
+    const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
+    const unsigned int num_channels              = original_input_shape.z();
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    if(data_layout == DataLayout::NCHW)
+    {
+        build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(num_elems_per_input_plane));
+        build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(num_channels));
+    }
+    else
+    {
+        build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(num_channels));
+        build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(num_elems_per_input_plane));
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convert_fc_weights", build_opts.options()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    ICLKernel::configure(win);
+}
+
+Status CLConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+                                                      DataLayout data_layout)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+                                                         DataType::QS32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+    return Status{};
+}
+
+void CLConvertFullyConnectedWeightsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, _input, window);
+    add_2D_tensor_argument(idx, _output, window);
+    enqueue(queue, *this, window);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
new file mode 100644
index 0000000..4f00ef9
--- /dev/null
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+CLCopyKernel::CLCopyKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(input->info()->tensor_shape(), output->info()->tensor_shape());
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
+
+    // Configure window
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    ICLKernel::configure(win);
+}
+
+void CLCopyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimX);
+    Window slice     = collapsed.first_slice_window_1D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_1D_tensor_argument(idx, _input, slice);
+        add_1D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_1D(slice));
+}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
deleted file mode 100644
index 29564b3..0000000
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-CLDepthwiseConvolutionLayer3x3Kernel::CLDepthwiseConvolutionLayer3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0)
-{
-}
-
-BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
-    bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
-
-    if(biases != nullptr)
-    {
-        if(is_qasymm)
-        {
-            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        }
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
-
-    // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(),
-                       output_shape,
-                       1,
-                       input->info()->data_type(),
-                       input->info()->fixed_point_position(),
-                       input->info()->quantization_info());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
-    _input         = input;
-    _output        = output;
-    _weights       = weights;
-    _biases        = biases;
-    _conv_stride_x = conv_info.stride().first;
-    _conv_stride_y = conv_info.stride().second;
-    _conv_pad_left = conv_info.pad_left();
-    _conv_pad_top  = conv_info.pad_top();
-    _border_size   = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
-
-    // Set build options
-    ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
-    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
-
-    if(is_qasymm)
-    {
-        float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
-        int   output_multiplier = 0;
-        int   output_shift      = 0;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-
-        build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
-        build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
-        build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
-        build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
-        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
-        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-    }
-
-    // Configure the local work size for Bifrost with a value obtained
-    // via exhaustive autotuning for the MobileNets tensor shapes.
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-
-    // Configure kernel window
-    unsigned int num_elems_read_per_iteration_x    = 0;
-    unsigned int num_elems_read_per_iteration_y    = 0;
-    unsigned int num_elems_written_per_iteration_x = 0;
-    unsigned int num_elems_written_per_iteration_y = 0;
-
-    // Create kernel
-    std::string kernel_name;
-
-    if(input->info()->data_type() == DataType::F16)
-    {
-        kernel_name                       = "depthwise_convolution_3x3_f16";
-        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
-        num_elems_written_per_iteration_y = 1;
-        num_elems_read_per_iteration_y    = 3;
-        switch(_conv_stride_x)
-        {
-            case 1:
-                num_elems_read_per_iteration_x = 8;
-                break;
-            case 2:
-                num_elems_read_per_iteration_x = 9;
-                break;
-            case 3:
-                num_elems_read_per_iteration_x = 16;
-                break;
-            default:
-                num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
-                break;
-        }
-    }
-    else if(input->info()->data_type() == DataType::F32 && gpu_target == GPUTarget::BIFROST)
-    {
-        if(_conv_stride_x == 1 && _conv_stride_y == 1)
-        {
-            kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost";
-            num_elems_read_per_iteration_x    = 4;
-            num_elems_read_per_iteration_y    = 6;
-            num_elems_written_per_iteration_x = 2;
-            num_elems_written_per_iteration_y = 4;
-        }
-        else if(_conv_stride_x == 2 && _conv_stride_y == 2)
-        {
-            kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost";
-            num_elems_read_per_iteration_x    = 6;
-            num_elems_read_per_iteration_y    = 5;
-            num_elems_written_per_iteration_x = 2;
-            num_elems_written_per_iteration_y = 2;
-        }
-        else
-        {
-            kernel_name                       = "depthwise_convolution_3x3";
-            num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
-            num_elems_written_per_iteration_y = 1;
-            num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
-            num_elems_read_per_iteration_y    = 3;
-        }
-    }
-    else
-    {
-        kernel_name                       = is_qasymm ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
-        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->info()->data_type());
-        num_elems_written_per_iteration_y = (is_qasymm && _conv_stride_y < 3) ? (2 / _conv_stride_y) : 1;
-        num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * _conv_stride_x;
-        num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
-    }
-
-    // Create window and update padding
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
-    AccessWindowRectangle input_access(input->info(), -_conv_pad_left, -_conv_pad_top,
-                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
-                                       _conv_stride_x, _conv_stride_y);
-    AccessWindowStatic    weights_access(weights->info(), 0, 0, 3, 3);
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
-    update_window_and_padding(win, input_access, weights_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
-
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-
-    // Set config_id for enabling LWS tuning
-    _config_id = kernel_name;
-    _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-}
-
-void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Create input window and adjust
-    Window win_in = window;
-    win_in.adjust(Window::DimX, -_conv_pad_left, true);
-    win_in.adjust(Window::DimY, -_conv_pad_top, true);
-    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
-    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
-
-    Window slice_in      = win_in.first_slice_window_3D();
-    Window slice_out     = window.first_slice_window_3D();
-    Window slice_weights = window.first_slice_window_3D();
-    slice_weights.set_dimension_step(Window::DimX, 0);
-    slice_weights.set_dimension_step(Window::DimY, 0);
-
-    // Set biases
-    if(_biases != nullptr)
-    {
-        unsigned int idx = 3 * num_arguments_per_3D_tensor();
-        Window       slice_biases;
-        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
-        add_1D_tensor_argument(idx, _biases, slice_biases);
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_3D_tensor_argument(idx, _output, slice_out);
-        add_3D_tensor_argument(idx, _weights, slice_weights);
-
-        enqueue(queue, *this, slice_out, _lws_hint);
-    }
-    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
new file mode 100644
index 0000000..e4ad97f
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                                                                                         && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                                                                                         && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU))),
+                                    "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
+
+    const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
+
+    if(biases != nullptr)
+    {
+        if(is_qasymm)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(2));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                        GPUTarget gpu_target, std::string &kernel_name)
+{
+    // Output auto initialization if not yet initialized
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+    const unsigned int conv_stride_x = conv_info.stride().first;
+    const unsigned int conv_stride_y = conv_info.stride().second;
+    const bool         is_qasymm     = is_data_type_quantized_asymmetric(input->data_type());
+    const bool         is_bifrost    = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+
+    // Configure kernel window
+    unsigned int num_elems_read_per_iteration_x    = 0;
+    unsigned int num_elems_read_per_iteration_y    = 0;
+    unsigned int num_elems_written_per_iteration_x = 0;
+    unsigned int num_elems_written_per_iteration_y = 0;
+
+    if(input->data_type() == DataType::F16)
+    {
+        kernel_name                       = "depthwise_convolution_3x3_f16";
+        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
+        num_elems_written_per_iteration_y = 1;
+        num_elems_read_per_iteration_y    = 3;
+        switch(conv_stride_x)
+        {
+            case 1:
+                num_elems_read_per_iteration_x = 8;
+                break;
+            case 2:
+                num_elems_read_per_iteration_x = 9;
+                break;
+            case 3:
+                num_elems_read_per_iteration_x = 16;
+                break;
+            default:
+                num_elems_read_per_iteration_x = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
+                break;
+        }
+        if(is_bifrost)
+        {
+            if(conv_stride_x == 1 && conv_stride_y == 1)
+            {
+                kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16";
+                num_elems_read_per_iteration_x    = 8;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_read_per_iteration_y    = 6;
+                num_elems_written_per_iteration_y = 4;
+            }
+            else if(conv_stride_x == 2 && conv_stride_y == 2)
+            {
+                kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16";
+                num_elems_read_per_iteration_x    = 10;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_read_per_iteration_y    = 5;
+                num_elems_written_per_iteration_y = 2;
+            }
+        }
+    }
+    else if(input->data_type() == DataType::F32 && is_bifrost)
+    {
+        if(conv_stride_x == 1 && conv_stride_y == 1)
+        {
+            kernel_name                       = "depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32";
+            num_elems_read_per_iteration_x    = 4;
+            num_elems_read_per_iteration_y    = 6;
+            num_elems_written_per_iteration_x = 2;
+            num_elems_written_per_iteration_y = 4;
+        }
+        else if(conv_stride_x == 2 && conv_stride_y == 2)
+        {
+            kernel_name                       = "depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32";
+            num_elems_read_per_iteration_x    = 6;
+            num_elems_read_per_iteration_y    = 5;
+            num_elems_written_per_iteration_x = 2;
+            num_elems_written_per_iteration_y = 2;
+        }
+        else
+        {
+            kernel_name                       = "depthwise_convolution_3x3";
+            num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
+            num_elems_written_per_iteration_y = 1;
+            num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
+            num_elems_read_per_iteration_y    = 3;
+        }
+    }
+    else
+    {
+        kernel_name                       = is_qasymm ? "depthwise_convolution_3x3_quantized_nchw" : "depthwise_convolution_3x3";
+        num_elems_written_per_iteration_x = 8 / data_size_from_type(input->data_type());
+        num_elems_written_per_iteration_y = (is_qasymm && conv_stride_y < 3) ? (2 / conv_stride_y) : 1;
+        num_elems_read_per_iteration_x    = 3 + (num_elems_written_per_iteration_x - 1) * conv_stride_x;
+        num_elems_read_per_iteration_y    = num_elems_written_per_iteration_y + 2;
+    }
+
+    // Create window and update padding
+    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+    AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
+                                       num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
+                                       conv_stride_x, conv_stride_y);
+    AccessWindowStatic    weights_access(weights, 0, 0, 3, 3);
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLDepthwiseConvolutionLayer3x3NCHWKernel::CLDepthwiseConvolutionLayer3x3NCHWKernel()
+    : _conv_stride_x(0), _conv_pad_top(0)
+{
+}
+
+BorderSize CLDepthwiseConvolutionLayer3x3NCHWKernel::border_size() const
+{
+    return _border_size;
+}
+
+void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                                         unsigned int        depth_multiplier,
+                                                         ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    bool is_qasymm = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+    _input         = input;
+    _output        = output;
+    _weights       = weights;
+    _biases        = biases;
+    _conv_stride_x = conv_info.stride().first;
+    _conv_stride_y = conv_info.stride().second;
+    _conv_pad_left = conv_info.pad_left();
+    _conv_pad_top  = conv_info.pad_top();
+    _border_size   = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
+    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+
+    if(is_qasymm)
+    {
+        float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
+        build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
+        build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
+        build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
+        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
+        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+
+        if(act_info.enabled())
+        {
+            const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+            const int o1    = input->info()->quantization_info().offset;
+
+            build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+            build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+            build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+            build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+
+            if(output != nullptr)
+            {
+                const float s1 = input->info()->quantization_info().scale;
+                const float s2 = output->info()->quantization_info().scale;
+                const int   o2 = output->info()->quantization_info().offset;
+
+                if(o1 != o2 || s1 != s2)
+                {
+                    build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+                    build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
+                    build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
+                    build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
+                }
+            }
+        }
+    }
+
+    // Configure kernel window
+    std::string     kernel_name;
+    const GPUTarget gpu_target = get_target();
+
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                          unsigned int        depth_multiplier,
+                                                          ActivationLayerInfo act_info, GPUTarget gpu_target)
+{
+    std::string kernel_name;
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, gpu_target, kernel_name).first);
+
+    return Status{};
+}
+
+void CLDepthwiseConvolutionLayer3x3NCHWKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Create input window and adjust
+    Window win_in = window;
+    win_in.adjust(Window::DimX, -_conv_pad_left, true);
+    win_in.adjust(Window::DimY, -_conv_pad_top, true);
+    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+    Window slice_in      = win_in.first_slice_window_3D();
+    Window slice_out     = window.first_slice_window_3D();
+    Window slice_weights = window.first_slice_window_3D();
+    slice_weights.set_dimension_step(Window::DimX, 0);
+    slice_weights.set_dimension_step(Window::DimY, 0);
+
+    // Set biases
+    if(_biases != nullptr)
+    {
+        unsigned int idx = 3 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        add_1D_tensor_argument(idx, _biases, slice_biases);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        add_3D_tensor_argument(idx, _weights, slice_weights);
+
+        enqueue(queue, *this, slice_out, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
new file mode 100644
index 0000000..a54e92c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Static argument checks for the NHWC 3x3 depthwise kernel: QASYMM8 only,
+// optional fused RELU / BOUNDED_RELU / LU_BOUNDED_RELU activation, no depth
+// multiplier support yet (COMPMID-1071).
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                    && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU),
+                                    "For QASYMM8 only relu, lower bounded relu and lower-upper bounded relu are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
+    // Kernel spatial dimensions must be exactly 3x3 (weights dims 1 and 2 here)
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
+
+    // Biases: S32, one-dimensional, one value per weights channel (dim 0)
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    // If the output is already initialized its shape must match the computed one
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+
+    return Status{};
+}
+
+// Computes the execution window and required paddings for the NHWC kernel.
+// Each iteration accesses 4 elements and processes 4 rows; a 3x3 kernel needs
+// 2 extra input rows per processed block, and the written rows shrink by the
+// vertical-direction stride (conv_info.stride().first here).
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output,
+                                                        const PadStrideInfo &conv_info)
+{
+    const unsigned int num_rows_processed_per_iteration = 4;
+    const unsigned int num_elems_accessed_per_iteration = 4;
+    const unsigned int num_rows_read_per_iteration      = num_rows_processed_per_iteration + 2;
+    const unsigned int num_rows_written_per_iteration   = num_rows_processed_per_iteration / conv_info.stride().first;
+
+    const BorderSize border_size(conv_info.pad_left() + num_rows_read_per_iteration * std::max(conv_info.pad_top(), conv_info.pad_bottom()), 0, conv_info.pad_right(), 0);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_accessed_per_iteration, num_rows_written_per_iteration));
+
+    AccessWindowStatic input_access(input, 0, -border_size.top, ceil_to_multiple(input->dimension(0), num_elems_accessed_per_iteration),
+                                    ceil_to_multiple(input->dimension(1) + border_size.bottom, num_rows_read_per_iteration));
+    AccessWindowRectangle  output_access(output, 0, 0, num_elems_accessed_per_iteration, num_rows_written_per_iteration);
+    AccessWindowHorizontal weights_access(weights, 0, num_elems_accessed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+
+    if(bias != nullptr)
+    {
+        AccessWindowHorizontal bias_access(bias, 0, num_elems_accessed_per_iteration);
+        // NOTE(review): || short-circuits, so the bias padding update is skipped
+        // once window_changed is already true — harmless, since in that case an
+        // "Insufficient Padding!" error is returned below anyway.
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    // Any padding change at this stage means the tensors were not padded as required
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+// Default constructor; the remaining state (_border_size, tensor pointers,
+// strides, ...) is populated by configure().
+CLDepthwiseConvolutionLayer3x3NHWCKernel::CLDepthwiseConvolutionLayer3x3NHWCKernel()
+    : _num_rows_processed_per_iteration(1)
+{
+}
+
+// Returns the border (padding) requirement computed in configure().
+BorderSize CLDepthwiseConvolutionLayer3x3NHWCKernel::border_size() const
+{
+    return _border_size;
+}
+
+void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                                         unsigned int        depth_multiplier,
+                                                         ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    // Get convolved dimensions
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
+
+    // Only strides 1 and 2 are supported, and at most one row of top/bottom padding
+    const unsigned int conv_stride_x = conv_info.stride().first;
+    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
+    ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
+
+    _input                            = input;
+    _output                           = output;
+    _weights                          = weights;
+    _biases                           = biases;
+    _conv_stride_y                    = conv_info.stride().second;
+    _conv_pad_left                    = conv_info.pad_left();
+    _num_rows_processed_per_iteration = 4;
+
+    // Must stay in sync with the constants in validate_and_configure_window()
+    const unsigned int num_elems_accessed_per_iteration = 4;
+    const unsigned int num_rows_read_per_iteration      = _num_rows_processed_per_iteration + 2;
+
+    _border_size = BorderSize(_conv_pad_left + num_rows_read_per_iteration * std::max(conv_info.pad_top(), conv_info.pad_bottom()), 0, conv_info.pad_right(), 0);
+
+    // Fold the input/weights/output quantization scales into a fixed-point
+    // multiplier + right-shift used by the kernel to requantize accumulators
+    float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+    int   output_multiplier = 0;
+    int   output_shift      = 0;
+    quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+    // Pass the asymmetric-quantization parameters to the OpenCL kernel as defines
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+    build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
+    build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
+    build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
+    build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+    build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
+    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
+    build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+    build_opts.add_option("-DROWS_READ=" + support::cpp11::to_string(num_rows_read_per_iteration));
+
+    // Fused activation: quantize the clamp bounds into the input's quantized domain
+    if(act_info.enabled())
+    {
+        const int a_val = input->info()->quantization_info().quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+        const int b_val = input->info()->quantization_info().quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+        const int o1    = input->info()->quantization_info().offset;
+
+        build_opts.add_option("-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
+        build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+        build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+        build_opts.add_option("-DCONST_0=" + support::cpp11::to_string(o1));
+
+        // If the output quantization differs from the input's, pass both so the
+        // kernel can requantize after the activation. (output is always non-null
+        // here — it was null-checked at the top of configure().)
+        if(output != nullptr)
+        {
+            const float s1 = input->info()->quantization_info().scale;
+            const float s2 = output->info()->quantization_info().scale;
+            const int   o2 = output->info()->quantization_info().offset;
+
+            if(o1 != o2 || s1 != s2)
+            {
+                build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+                build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
+                build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
+                build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
+            }
+        }
+    }
+
+    // Create kernel: the variant is selected by the horizontal stride (1 or 2)
+    std::string kernel_name = std::string("depthwise_convolution_3x3_quantized_nhwc_stride") + support::cpp11::to_string(conv_stride_x);
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+// Static validation: runs the argument checks, then dry-runs the window
+// configuration on cloned tensor infos so the callers' infos are not modified.
+Status CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                          unsigned int        depth_multiplier,
+                                                          ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
+                                                              biases != nullptr ? biases->clone().get() : nullptr,
+                                                              output->clone().get(), conv_info)
+                                .first);
+
+    return Status{};
+}
+
+void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Create input window and adjust: DimY is shifted by the left padding and
+    // stepped by the rows processed per iteration, DimZ is stepped by the
+    // vertical stride. (Assumes NHWC windowing where Y/Z walk the spatial
+    // dimensions — TODO(review) confirm against the OpenCL kernel.)
+    Window win_in = window;
+    win_in.adjust(Window::DimY, -_conv_pad_left, true);
+    win_in.set_dimension_step(Window::DimY, _num_rows_processed_per_iteration);
+    win_in.set_dimension_step(Window::DimZ, _conv_stride_y);
+
+    // The input window must never advance slower than the output window
+    ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
+
+    Window slice_in  = win_in.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_3D();
+
+    // Set biases once (1D argument placed after the three 3D tensor arguments)
+    if(_biases != nullptr)
+    {
+        unsigned int idx = 3 * num_arguments_per_3D_tensor();
+        Window       win_biases;
+        win_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        win_biases.set_dimension_step(Window::DimX, window.x().step());
+        add_1D_tensor_argument(idx, _biases, win_biases);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        // NOTE(review): weights are added with the output slice here, unlike the
+        // NCHW kernel which uses a dedicated zero-step weights window — confirm
+        // this is intended for the NHWC weights layout.
+        add_3D_tensor_argument(idx, _weights, slice_out);
+
+        enqueue(queue, *this, slice_out, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 9851475..f44f08b 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -42,14 +42,26 @@
 {
 }
 
-void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+namespace
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+// Static argument checks for the depthwise im2col kernel.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+    // conv_info is part of the public signature but unused by these checks
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Quantized path has no bias support here
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
+    // Output depth is the input depth scaled by the depth multiplier
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
+    // Each output row holds one unrolled kernel patch (+1 lane when biased)
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+    return Status{};
+}
+} // namespace
+
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
 
     _input  = input;
     _output = output;
@@ -68,6 +80,7 @@
     build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
     build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+    build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
     build_opts.add_option_if(has_bias, "-DHAS_BIAS");
     build_opts.add_option_if_else(is_data_type_quantized_asymmetric(input->info()->data_type()),
                                   "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset),
@@ -77,20 +90,28 @@
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-    if(gpu_target == GPUTarget::BIFROST)
+    const GPUTarget gpu_target = get_target();
+
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
         _lws_hint = cl::NDRange(1, 2, 1);
     }
 
     // Configure  kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-    // The CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    Window win = calculate_max_window(*output->info(), Steps());
+    // CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
     output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
 }
 
+// Static validation entry point: forwards to the file-local argument checks.
+Status CLDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+
+    return Status{};
+}
+
 void CLDepthwiseIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index 83fc168..26336eb 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -34,6 +34,34 @@
 
 using namespace arm_compute;
 
+namespace
+{
+// Reshapes a vector-shaped input (length conv_w * conv_h * channels along X)
+// into a (conv_w, conv_h, channels) tensor shape; remaining dims are kept.
+TensorShape compute_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h)
+{
+    TensorShape output_shape(input);
+    output_shape.set(0, conv_w);
+    output_shape.set(1, conv_h);
+    output_shape.set(2, input.x() / (conv_w * conv_h));
+
+    return output_shape;
+}
+
+// Static argument checks for the vector-to-tensor kernel; output checks only
+// apply once the output has been initialized.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = compute_output_shape(input->tensor_shape(), conv_w, conv_h);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
 CLDepthwiseVectorToTensorKernel::CLDepthwiseVectorToTensorKernel()
     : _input(nullptr), _output(nullptr)
 {
@@ -41,20 +69,13 @@
 
 void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, conv_w);
-    output_shape.set(1, conv_h);
-    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
+    TensorShape output_shape = compute_output_shape(input->info()->tensor_shape(), conv_w, conv_h);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), conv_w, conv_h));
 
     _input  = input;
     _output = output;
@@ -75,6 +96,12 @@
     ICLKernel::configure(win);
 }
 
+// Static validation entry point: forwards to the file-local argument checks.
+Status CLDepthwiseVectorToTensorKernel::validate(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_w, conv_h));
+    return Status{};
+}
+
 void CLDepthwiseVectorToTensorKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
index 26da96f..b5a607d 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -34,6 +34,29 @@
 
 using namespace arm_compute;
 
+namespace
+{
+// Static argument checks for the depthwise weights-reshape kernel.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Quantized path has no bias support here
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr));
+    // One output row per input channel (input dim 2)
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(1));
+    // Each output row holds one unrolled kernel (+1 lane when biased)
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(0) * input->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+
+    // Biases: same type as input, one-dimensional, one value per channel
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(2));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    return Status{};
+}
+} // namespace
+
 CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
     : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
@@ -41,20 +64,8 @@
 
 void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), (biases != nullptr) ? biases->info() : nullptr));
 
     _input  = input;
     _biases = biases;
@@ -80,6 +91,12 @@
     ICLKernel::configure(win);
 }
 
+// Static validation entry point: forwards to the file-local argument checks.
+Status CLDepthwiseWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, biases));
+    return Status{};
+}
+
 void CLDepthwiseWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index 4efdb76..fa982d6 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
 
 using namespace arm_compute;
 
+namespace
+{
+// Static argument checks for the dequantization kernel: U8 input of at least
+// 3 dimensions, F32 output matching the input shape once initialized.
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+// Computes the execution window (4 elements per iteration) and required
+// paddings for the dequantization kernel.
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Configure window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    // min_max is accessed as 2 elements per row (presumably a min/max pair —
+    // TODO(review) confirm against the OpenCL kernel)
+    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+    // Update window and padding
+    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    // Any padding change at this stage means the tensors were not padded as required
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 CLDequantizationLayerKernel::CLDequantizationLayerKernel()
     : _input(nullptr), _output(nullptr), _min_max(nullptr)
 {
@@ -41,37 +81,30 @@
 
 void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
 
     _input   = input;
     _output  = output;
     _min_max = min_max;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer"));
 
-    // Configure window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
 
-    // Update window and padding
-    update_window_and_padding(win, input_access, output_access, min_max_access);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+    ICLKernel::configure(std::get<1>(win_config));
+}
 
-    ICLKernel::configure(win);
+// Static validation: argument checks plus a dry-run of the window
+// configuration on cloned infos, leaving the callers' infos untouched.
+Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+    return Status{};
 }
 
 void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 86a3581..7c6c7de 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -34,6 +34,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "support/ToolchainSupport.h"
 
@@ -41,26 +42,6 @@
 
 namespace
 {
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
-    TensorShape output_shape = input_shape;
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-    output_shape.set(2, weights_shape[3]);
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
@@ -100,7 +81,7 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
-                                                           get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
+                                                           misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
@@ -114,7 +95,7 @@
     const DataType     data_type   = input->data_type();
 
     // Get convolved dimensions
-    TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output, output_shape,
@@ -133,7 +114,8 @@
     unsigned int num_elems_written_per_iteration_x = 0;
     unsigned int num_elems_written_per_iteration_y = 0;
 
-    if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
+    if(gpu_target_is_in(target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && (kernel_size <= 5) && (conv_stride_x == 1)
+       && (conv_stride_y == 1) && (data_type == DataType::F32))
     {
         // Configure kernel window
 
@@ -273,7 +255,7 @@
     const DataType     data_type   = input->info()->data_type();
 
     // Get convolved dimensions
-    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(),
@@ -299,7 +281,7 @@
     _output  = output;
     _biases  = biases;
 
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    const GPUTarget gpu_target = get_target();
 
     std::stringstream kernel_name;
     kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
@@ -307,85 +289,13 @@
     CLBuildOptions build_options;
     build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
 
-    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && (kernel_size <= 5) && (_conv_stride_x == 1)
+       && (_conv_stride_y == 1) && (data_type == DataType::F32))
     {
         build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
 
         kernel_name << "_f32_bifrost";
         _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
-
-        // Through extensive experimentation with over 30 representative tensor
-        // shapes, we found a small number of local work size configurations
-        // that result in nearly optimal execution times. Selecting the right
-        // lws for a given shape, however, required a complex decision tree,
-        // until we constructed a simple feature as described below.
-        //
-        // We started from the number of multiply-accumulate operations for a
-        // convolution layer, which is equal to the product of the input
-        // dimensions 0..2 and the weights dimensions 0..2.  Unfortunately,
-        // this resulted in ties between distinct shapes that required distinct
-        // lws configurations. Replacing the width of the input with the kernel
-        // size, however, resulted in nearly optimal predictions. We use underscores
-        // in variable names to indicate when they are intentionally misleading.
-        const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
-        const size_t product_of_input_dimensions_  = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
-        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
-
-        switch(kernel_size)
-        {
-            case 1:
-            {
-                if(mega_ops_ < 1.f)
-                {
-                    _lws_hint = cl::NDRange(1, 1, 8);
-                }
-                else if(mega_ops_ < 7.f)
-                {
-                    _lws_hint = cl::NDRange(1, 1, 4);
-                }
-                else
-                {
-                    _lws_hint = cl::NDRange(1, 1, 2);
-                }
-                break;
-            }
-            case 3:
-            {
-                if(mega_ops_ < 1.f)
-                {
-                    _lws_hint = cl::NDRange(1, 1, 8);
-                }
-                else if(mega_ops_ < 13.f)
-                {
-                    _lws_hint = cl::NDRange(2, 1, 4);
-                }
-                else if(mega_ops_ < 50.f)
-                {
-                    _lws_hint = cl::NDRange(3, 1, 4);
-                }
-                else
-                {
-                    _lws_hint = cl::NDRange(2, 1, 6);
-                }
-                break;
-            }
-            case 5:
-            {
-                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
-                {
-                    _lws_hint = cl::NDRange(2, 1, 4);
-                }
-                else
-                {
-                    _lws_hint = cl::NDRange(2, 1, 8);
-                }
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
-            }
-        }
     }
     else
     {
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 241dd85..8f669a9 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -69,19 +69,23 @@
     AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
     window_changed = window_changed || update_window_and_padding(win, input_access);
 
-    // Configure window in case of configured output
-    if(output->total_size() != 0)
-    {
-        const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
-        const float scale_y = 1.0f / (scale_x);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_interleaved_shape(*input, mult_interleave4x4_height)));
 
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
+    // Configure window
+    const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
+    const float scale_y = 1.0f / (scale_x);
+
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
+    window_changed = window_changed || update_window_and_padding(win, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
@@ -136,6 +140,10 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
 }
 
 Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
@@ -160,15 +168,14 @@
      *
      * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
      */
-    Window in_slice  = window.first_slice_window_2D();
-    Window out_slice = window.first_slice_window_2D();
+    Window slice = window.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, _lws_hint);
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+    while(window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index ae498ec..3f705ac 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index d5c93dd..d409fdb 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
                                                         unsigned int &num_elems_processed_per_iteration)
 {
     // Select the vector size to use (8 for Bifrost; 16 for Midgard).
-    num_elems_processed_per_iteration = (gpu_target == GPUTarget::BIFROST) ? 8 : 16;
+    num_elems_processed_per_iteration = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) ? 8 : 16;
 
     // Configure kernel window
     Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
@@ -81,12 +81,12 @@
     _biases = biases;
     _accum  = accum;
 
-    // Get the target architecture
-    GPUTarget    arch_target = get_arch_from_target(get_target());
+    // Get the target gpu
+    GPUTarget    gpu_target  = get_target();
     unsigned int vector_size = 0;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(accum->info(), biases->info(), arch_target, vector_size);
+    auto win_config = validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 4b4814f..4538812 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -58,16 +58,15 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const float beta)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(input, output, beta);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != output->dimension(0));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != output->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
-    ARM_COMPUTE_UNUSED(beta);
     return Status{};
 }
 } // namespace
@@ -114,11 +113,10 @@
     ICLKernel::configure(win_config.second);
 }
 
-Status CLGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const float beta)
+Status CLGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, beta));
-    ARM_COMPUTE_RETURN_ERROR_ON(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, beta));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
     return Status{};
 }
 
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 6c31e37..cc9ae27 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -54,6 +55,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
 
     if(!is_interleaved_transposed)
     {
@@ -105,7 +107,7 @@
 }
 
 inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
-                                                               bool is_interleaved_transposed, GPUTarget gpu_target,
+                                                               bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
                                                                ElementsProcessed &num_elements_processed)
 {
     bool   window_changed = false;
@@ -115,6 +117,9 @@
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
 
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
+
     if(is_interleaved_transposed)
     {
         // Configure kernel window
@@ -124,7 +129,9 @@
         win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
         AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowStatic    input1_access(input1, 0, 0,
+                                            ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                            ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
         AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
         window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
@@ -138,7 +145,8 @@
         num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
 
         // Create kernels according to the architecture, data type and input size.
-        if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        GPUTarget arch_target = get_arch_from_target(gpu_target);
+        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
         {
             num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
         }
@@ -157,13 +165,19 @@
         output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
     }
 
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
 CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
 {
 }
 
@@ -171,45 +185,64 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
-    // Output tensor auto inizialitation if not yet initialized
-    TensorShape tensor_shape{ input0->info()->tensor_shape() };
-    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
-    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
-
-    auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
-
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
+    _input0         = input0;
+    _input1         = input1;
+    _output         = output;
+    _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
 
     const DataType data_type = input0->info()->data_type();
     const int      fp_pos    = input0->info()->fixed_point_position();
 
     // Get target architecture
-    GPUTarget arch_target = get_arch_from_target(get_target());
+    GPUTarget gpu_target = get_target();
 
     // Configure LWS hint
-    if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
+    switch(gpu_target)
     {
-        // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
-        _lws_hint = cl::NDRange(2, 2);
-    }
-    else if(output->info()->dimension(1) == 196)
-    {
-        _lws_hint = cl::NDRange(1, 7);
-    }
-    else
-    {
-        _lws_hint = cl::NDRange(8, 8);
+        case GPUTarget::MIDGARD:
+        case GPUTarget::T600:
+        case GPUTarget::T700:
+        case GPUTarget::T800:
+            if(output->info()->dimension(1) == 196)
+            {
+                _lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                _lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        case GPUTarget::G71:
+        case GPUTarget::G72:
+        case GPUTarget::G51:
+        case GPUTarget::G51BIG:
+        case GPUTarget::G51LIT:
+        case GPUTarget::TNOX:
+            if(input1->info()->dimension(1) == 24)
+            {
+                // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+                _lws_hint = cl::NDRange(2, 2);
+            }
+            else if(output->info()->dimension(1) == 196)
+            {
+                _lws_hint = cl::NDRange(1, 7);
+            }
+            else
+            {
+                _lws_hint = cl::NDRange(8, 8);
+            }
+            break;
+        default:
+            _lws_hint = cl::NullRange;
     }
 
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -225,6 +258,11 @@
                                       "-DALPHA=" + float_to_string_with_full_precision(alpha));
     }
 
+    // Do not slide matrix B if _slide_matrix_b = false
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
+    const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
+
     std::string kernel_name;
     if(is_interleaved_transposed)
     {
@@ -235,9 +273,9 @@
         build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
         build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
 
-        if(data_type == DataType::F32)
+        if(is_data_type_float(data_type) && is_bifrost)
         {
-            kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
+            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
         }
         else
         {
@@ -247,14 +285,24 @@
     else // The input tensors have not been reshaped
     {
         build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
 
         // Create kernels according to the architecture, data type and input size.
-        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        if(is_data_type_float(data_type) && is_bifrost)
         {
-            // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
-            // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
-            // FC6 and FC7 of AlexNet and VGG-16).
-            kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+            kernel_name = "gemm_mm_floating_point";
+
+            if(input0->info()->num_dimensions() != 1)
+            {
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+            }
+            else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
+            {
+                // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+                // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+                // FC6 and FC7 of AlexNet and VGG-16).
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
+            }
 
             // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
             // via exhaustive autotuning over a range of representative layer configurations.
@@ -266,7 +314,6 @@
         }
         else // (MIDGARD and F32) or (F16)
         {
-            build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
             kernel_name = "gemm_mm_floating_point";
         }
         build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
@@ -285,6 +332,10 @@
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
+    _config_id += "_";
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
@@ -299,6 +350,7 @@
                                                               input1->clone().get(),
                                                               output->clone().get(),
                                                               is_interleaved_transposed,
+                                                              reshape_info,
                                                               gpu_target,
                                                               num_elements_processed)
                                 .first);
@@ -311,7 +363,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice          = window.first_slice_window_2D();
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
 
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -321,8 +379,8 @@
     {
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-        if(_input1->info()->num_dimensions() < 3)
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -331,7 +389,10 @@
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index cc483dc..b2ea95b 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -34,6 +34,42 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_read_per_iteration = 4;
+    constexpr unsigned int num_rows_read_per_iteration  = 4;
+
+    const unsigned int border_x = ceil_to_multiple(input0->dimension(0), num_elems_read_per_iteration) - input0->dimension(0);
+    const unsigned int border_y = ceil_to_multiple(input0->dimension(1), num_rows_read_per_iteration) - input0->dimension(1);
+
+    Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration));
+
+    AccessWindowRectangle  input0_access(input0, 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration);
+    AccessWindowStatic     output_access(output, 0, 0, output->dimension(0) + border_x, output->dimension(1) + border_y);
+
+    bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLGEMMMatrixVectorMultiplyKernel::CLGEMMMatrixVectorMultiplyKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr), _num_rows_read_per_iteration(0), _border_size(0)
 {
@@ -45,11 +81,8 @@
 
 void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
-    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
@@ -77,8 +110,8 @@
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-    if(gpu_target == GPUTarget::BIFROST)
+    const GPUTarget gpu_target = get_target();
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
         _lws_hint = cl::NDRange(1, 1, 1);
     }
@@ -93,17 +126,17 @@
 
     _border_size = BorderSize(border_y, border_x);
 
-    Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    AccessWindowRectangle  input0_access(input0->info(), 0, 0, num_elems_read_per_iteration, _num_rows_read_per_iteration);
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
-    AccessWindowStatic     output_access(_output->info(), 0, 0, _output->info()->dimension(0) + border_x, _output->info()->dimension(1) + border_y);
+Status CLGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
 
-    update_window_and_padding(win, input0_access, input1_access, output_access);
-
-    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLGEMMMatrixVectorMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 24d2187..05a20fd 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -70,24 +71,21 @@
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
-    if((win.x().end() / scale_x) == 0)
-    {
-        return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win);
-    }
-
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width)));
 
     // Configure window in case of configured output
-    if(output->total_size() != 0)
-    {
-        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
-    }
+    AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), scale_x), output->dimension(1));
+    window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+
+    // Collapse along the Z direction
+    Window collapsed = win.collapse(win, Window::DimZ);
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
@@ -151,15 +149,15 @@
     out_window.set(Window::DimX, window.y());
     out_window.set(Window::DimY, window.x());
 
-    Window in_slice  = window.first_slice_window_2D();
-    Window out_slice = out_window.first_slice_window_2D();
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_3D_tensor_argument(idx, _output, out_slice);
         enqueue(queue, *this, in_slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
 }
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 34a228c..a4fda36 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,20 +32,19 @@
 using namespace arm_compute;
 
 CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel()
-    : _border_size(0), _l2_load_offset(0)
+    : _l2_load_offset(0)
 {
 }
 
 BorderSize CLGaussianPyramidHorKernel::border_size() const
 {
-    return _border_size;
+    return BorderSize(0, 2);
 }
 
-void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
 
     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
@@ -53,9 +52,8 @@
         ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
     }
 
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+    _input  = input;
+    _output = output;
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x"));
@@ -64,9 +62,9 @@
     constexpr unsigned int num_elems_processed_per_iteration = 16;
     constexpr unsigned int num_elems_read_per_iteration      = 20;
     constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr float        scale_x                           = 0.5f;
+    const float            scale_x                           = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
 
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
 
     // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
@@ -95,11 +93,7 @@
                               AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
                               output_access);
 
-    ValidRegion valid_region = input->info()->valid_region();
-    valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
-    valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
-
-    output_access.set_valid_region(win, valid_region);
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
 }
@@ -139,12 +133,11 @@
     return BorderSize(2, 0);
 }
 
-void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
 
     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
     {
@@ -163,10 +156,10 @@
     constexpr unsigned int num_elems_written_per_iteration   = 8;
     constexpr unsigned int num_elems_read_per_iteration      = 8;
     constexpr unsigned int num_rows_per_iteration            = 5;
-    constexpr float        scale_y                           = 0.5f;
 
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration),
-                                      border_undefined, border_size());
+    const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
+
+    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
     AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
 
     // Determine whether we need to load even or odd rows. See above for a
@@ -182,11 +175,7 @@
                               AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
                               output_access);
 
-    ValidRegion valid_region = input->info()->valid_region();
-    valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
-    valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
-
-    output_access.set_valid_region(win, valid_region);
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
 }
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
index 87659c4..a15aab1 100644
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -172,7 +172,7 @@
                               AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                               output_access);
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
 }
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
index 0f9a989..caca498 100644
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,10 +70,10 @@
     args_str << "-DTHRESHOLD=" << threshold << " ";
     args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
     args_str << "-DIDX_CLASS=" << idx_class << " ";
-    args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
-    args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
     args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
     args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+    args_str << "-DDETECTION_WINDOW_STRIDE_WIDTH=" << detection_window_stride.width << " ";
+    args_str << "-DDETECTION_WINDOW_STRIDE_HEIGHT=" << detection_window_stride.height << " ";
 
     // Construct kernel name
     std::set<std::string> build_opts = {};
@@ -102,8 +102,8 @@
 
     // Configure kernel window
     Window win;
-    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
-    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
+    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
 
     constexpr unsigned int num_elems_read_per_iteration = 1;
     const unsigned int     num_rows_read_per_iteration  = num_blocks_per_descriptor_y;
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index b75d264..d04c1dc 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -41,11 +41,12 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -63,19 +64,19 @@
 {
 }
 
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation));
 
     _input       = input;
     _output      = output;
     _kernel_dims = kernel_dims;
 
     const DataType  data_type  = input->info()->data_type();
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    const GPUTarget gpu_target = get_target();
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -107,7 +108,7 @@
 
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                             kernel_dims.width, kernel_dims.height,
-                                            conv_info);
+                                            conv_info, dilation);
 
         build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
         build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
@@ -122,77 +123,82 @@
         build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
         build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+        build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
         build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
 
         const bool squared_im2col = kernel_dims.width == kernel_dims.height;
 
-        if(squared_im2col && !is_data_type_fixed_point(data_type))
+        if(dilation == Size2D(1U, 1U))
         {
-            // Check if we can run an optimized im2col
-            switch(kernel_dims.width)
+            if(squared_im2col && !is_data_type_fixed_point(data_type))
             {
-                case 1:
-                    // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
-                    if(conv_info.stride().first == 1 && !conv_info.has_padding())
-                    {
-                        // Set hint for LWS
+                // Check if we can run an optimized im2col
+                switch(kernel_dims.width)
+                {
+                    case 1:
+                        // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+                        if(conv_info.stride().first == 1 && !conv_info.has_padding())
+                        {
+                            // Set hint for LWS
+                            _lws_hint                          = cl::NDRange(1, 1, 8);
+                            _num_elems_processed_per_iteration = 4;
+                            is_optimized_path                  = true;
+                            kernel_name                        = "im2col1x1_stridex1_dchw";
+                        }
+                        break;
+                    case 3:
                         _lws_hint                          = cl::NDRange(1, 1, 8);
-                        _num_elems_processed_per_iteration = 4;
-                        is_optimized_path                  = true;
-                        kernel_name                        = "im2col1x1_stridex1_dchw";
-                    }
-                    break;
-                case 3:
-                    _lws_hint                          = cl::NDRange(1, 1, 8);
-                    _num_elems_processed_per_iteration = 1;
-                    is_optimized_path                  = true;
-                    kernel_name                        = "im2col3x3_dchw";
-                    break;
-                case 5:
-                    _num_elems_processed_per_iteration = 1;
-                    is_optimized_path                  = true;
-                    kernel_name                        = "im2col5x5_dchw";
-                    break;
-                case 11:
-                    // Optimized im2col11x11 if pad_x = pad_y = 0
-                    if(!conv_info.has_padding())
-                    {
                         _num_elems_processed_per_iteration = 1;
                         is_optimized_path                  = true;
-                        kernel_name                        = "im2col11x11_padx0_pady0_dchw";
-                    }
-                    break;
-                default:
-                    is_optimized_path = false;
-                    break;
+                        kernel_name                        = "im2col3x3_dchw";
+                        break;
+                    case 5:
+                        _num_elems_processed_per_iteration = 1;
+                        is_optimized_path                  = true;
+                        kernel_name                        = "im2col5x5_dchw";
+                        break;
+                    case 11:
+                        // Optimized im2col11x11 if pad_x = pad_y = 0
+                        if(!conv_info.has_padding())
+                        {
+                            _num_elems_processed_per_iteration = 1;
+                            is_optimized_path                  = true;
+                            kernel_name                        = "im2col11x11_padx0_pady0_dchw";
+                        }
+                        break;
+                    default:
+                        is_optimized_path = false;
+                        break;
+                }
             }
-        }
-        else if(kernel_dims.width > 1 && !conv_info.has_padding())
-        {
-            _num_elems_processed_per_iteration = 1;
-            kernel_name                        = "im2col_generic_padx0_pady0_dchw";
+            else if(kernel_dims.width > 1 && !conv_info.has_padding())
+            {
+                _num_elems_processed_per_iteration = 1;
+                kernel_name                        = "im2col_generic_padx0_pady0_dchw";
 
-            // Optimized im2col is performed using one or more vector operations with the specified vector size
-            // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
-            // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
-            // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
-            // Using the vector size of 8, however, may be faster.
-            size_t vector_size = 4;
-            // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
-            // is used instead.)
-            if(kernel_dims.width < vector_size)
-            {
-                vector_size = kernel_dims.width;
+                // Optimized im2col is performed using one or more vector operations with the specified vector size
+                // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+                // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+                // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+                // Using the vector size of 8, however, may be faster.
+                size_t vector_size = 4;
+                // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+                // is used instead.)
+                if(kernel_dims.width < vector_size)
+                {
+                    vector_size = kernel_dims.width;
+                }
+                // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+                if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && kernel_dims.width == 11)
+                {
+                    _lws_hint   = cl::NDRange(1, 1, 1);
+                    vector_size = 8;
+                }
+                const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+                build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+                build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
             }
-            // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
-            if(gpu_target == GPUTarget::BIFROST && kernel_dims.width == 11)
-            {
-                _lws_hint   = cl::NDRange(1, 1, 1);
-                vector_size = 8;
-            }
-            const size_t width_mod_vector_size = kernel_dims.width % vector_size;
-            build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-            build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
         }
         _run_func = &CLIm2ColKernel::run_generic;
     }
@@ -206,7 +212,7 @@
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-    // Configure  kernel window
+    // Configure kernel window
     Window win;
     if(is_optimized_path)
     {
@@ -250,12 +256,12 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(kernel_dims);
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(has_bias);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation));
     return Status{};
 }
 
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 36e351e..3d30350 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,18 +42,60 @@
 {
 }
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+
+    // Reduce shape on axis
+    TensorShape sum_shape = input->tensor_shape();
+    sum_shape.set(axis, 1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    // Sum and output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
 
     _input   = input;
     _sum     = sum;
@@ -76,15 +118,18 @@
     _kernel.setArg<cl_uint>(idx, _epsilon);
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(_input->info(), _output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    ICLKernel::configure(std::get<1>(win_config));
+}
 
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
 
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
index 12cdd0e..078d18e 100644
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -249,8 +249,12 @@
             static_cast<cl_float>(valid_region.start(0))
         }
     };
-    const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
-    const int term_epsilon   = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+
+    // Set maximum number of iterations used for convergence
+    const size_t max_iterations = 1000;
+    num_iterations              = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
+
+    const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1"));
@@ -268,7 +272,6 @@
     _kernel.setArg<cl_float3>(idx++, border_limits);
     _kernel.setArg<cl_float>(idx++, eig_const);
     _kernel.setArg<cl_int>(idx++, level0);
-    _kernel.setArg<cl_int>(idx++, term_iteration);
     _kernel.setArg<cl_int>(idx++, term_epsilon);
 }
 
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index a3af5b0..84f2e0c 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,13 +46,44 @@
 {
 }
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->data_type());
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+    AccessWindowHorizontal input0_access(input0, 0, num_elems_processed_per_iteration_x);
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_x);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+    bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
@@ -77,20 +108,20 @@
     std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
     _kernel                    = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_lc_vm_" + data_type_name), build_opts));
 
-    // Configure window kernel
-    const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
 
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
-    AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+    ICLKernel::configure(std::get<1>(win_config));
+}
 
-    update_window_and_padding(win, input0_access, input1_access, output_access);
+Status CLLocallyConnectedMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get())));
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index 8ba1f77..60dd5e7 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -30,10 +30,55 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <climits>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+        TensorShape output_shape = compute_min_max_shape(input);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    TensorShape output_shape = compute_min_max_shape(input);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+    const unsigned int num_elems_processed_per_iteration = 1;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(output, 0, 0, 2, output->dimension(1));
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win);
+}
+} // namespace
 
 CLMinMaxLayerKernel::CLMinMaxLayerKernel()
     : _input(nullptr), _output(nullptr)
@@ -42,26 +87,12 @@
 
 void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(Window::DimX, 2);
-    output_shape.remove_dimension(1);
-    output_shape.remove_dimension(1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 1;
-
     std::set<std::string> build_opts;
     build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -70,16 +101,19 @@
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax_layer", build_opts));
 
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, 2, output->info()->dimension(1));
+    auto win_config = validate_and_configure_window(input->info(), output->info());
 
-    update_window_and_padding(win, input_access, output_access);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(std::get<1>(win_config));
+}
 
-    ICLKernel::configure(win);
+Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+
+    return Status{};
 }
 
 void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index da34448..d20bee1 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -114,13 +114,18 @@
     // Configure  kernel window
     Window win = calculate_max_window(*input->info(), Steps());
 
+    // The CLPermute kernel doesn't need padding, so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
     ICLKernel::configure(win);
 }
 
 Status CLPermuteKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, perm));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
 
     return Status{};
 }
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index b3034e1..02fa283 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -34,53 +34,52 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <set>
 #include <string>
 #include <tuple>
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
 // Internal window config info
 using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
 
-void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
+void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info)
 {
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(0, pooled_w);
-    output_shape.set(1, pooled_h);
-
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+    TensorShape out_shape = compute_pool_shape(*input, pool_info);
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
 }
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    DataLayout data_layout = input->data_layout();
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+            break;
+        case DataLayout::NHWC:
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
                                     "Unsupported combination of parameters!");
 
-    const bool         is_global_pooling = pool_info.is_global_pooling();
-    const unsigned int pool_size_x       = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
-    const unsigned int pool_size_y       = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
-
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
-        unsigned int pooled_w = 0;
-        unsigned int pooled_h = 0;
-        std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
-                                                         input->dimension(1),
-                                                         pool_size_x,
-                                                         pool_size_y,
-                                                         pool_info.pad_stride_info());
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h),
-                                        "Invalid output pooling dimensions!");
+        TensorInfo out_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, output->data_type(), output->fixed_point_position()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
     }
 
     return Status{};
@@ -88,59 +87,82 @@
 
 std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Get data layout
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     unsigned int        pooled_w        = 0;
     unsigned int        pooled_h        = 0;
-    int                 pool_size_x     = pool_info.is_global_pooling() ? input->dimension(0) : pool_info.pool_size().width;
-    int                 pool_size_y     = pool_info.is_global_pooling() ? input->dimension(1) : pool_info.pool_size().height;
+    int                 pool_size_x     = pool_info.is_global_pooling() ? input->dimension(idx_width) : pool_info.pool_size().width;
+    int                 pool_size_y     = pool_info.is_global_pooling() ? input->dimension(idx_height) : pool_info.pool_size().height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int pool_pad_right  = pad_stride_info.pad_right();
-    const int pool_pad_top    = pad_stride_info.pad_top();
-    const int pool_pad_left   = pad_stride_info.pad_left();
-    const int pool_pad_bottom = pad_stride_info.pad_bottom();
+    const int  pool_pad_right  = pad_stride_info.pad_right();
+    const int  pool_pad_top    = pad_stride_info.pad_top();
+    const int  pool_pad_left   = pad_stride_info.pad_left();
+    const int  pool_pad_bottom = pad_stride_info.pad_bottom();
+    BorderSize border_size     = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
 
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    auto_init(input, output, pool_info);
+    pooled_w = output->tensor_shape()[idx_width];
+    pooled_h = output->tensor_shape()[idx_height];
 
-    // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
-                                                     input->dimension(1),
-                                                     pool_size_x,
-                                                     pool_size_y,
-                                                     pad_stride_info);
+    const DataType data_type = input->data_type();
 
-    auto_init(input, output, pooled_w, pooled_h);
+    const int input_width  = input->dimension(idx_width);
+    const int input_height = input->dimension(idx_height);
 
-    BorderSize     border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
-    const DataType data_type   = input->data_type();
+    unsigned int num_elems_processed_per_iteration = 0;
+    bool         window_changed                    = false;
+    Window       win{};
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            // Change the number of elements processed per iteration
+            // for pooling 3x3 with stride less than or equal to 3
+            const bool can_optimize                         = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
+            num_elems_processed_per_iteration               = can_optimize ? 4 : 1;
+            const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
 
-    const int input_width  = input->dimension(0);
-    const int input_height = input->dimension(1);
+            // Number of iterations in X dimension
+            const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
 
-    // Change the number of elements processed per iteration
-    // for pooling 3x3 with stride less equal than 3
-    const bool         can_optimize                      = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
-    const unsigned int num_elems_processed_per_iteration = can_optimize ? 4 : 1;
-    const int          num_elems_read_per_iteration      = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
+            // Upper limit for the number of right/bottom border elements that are accessed
+            const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
+            const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
 
-    // Number of iterations in X dimension
-    const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
+            border_size.right  = std::max(upper_bound_w, pool_pad_right);
+            border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
 
-    // Upper limit for the number of right/bottom border elements that are accessed
-    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
-    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
+            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    border_size.right  = std::max(upper_bound_w, pool_pad_right);
-    border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
+            AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
+                                               pool_stride_x, pool_stride_y);
+            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, input_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            num_elems_processed_per_iteration = 8;
+            win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
-                                       pool_stride_x * num_elems_processed_per_iteration, pool_stride_y);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            AccessWindowRectangle  input_access(input, 0, -pool_pad_left, num_elems_processed_per_iteration, pool_size_x);
+            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, input_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_tuple(err, win, CLPoolingConfig(num_elems_processed_per_iteration, border_size));
@@ -159,30 +181,25 @@
 
 void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
-    unsigned int        pooled_w        = 0;
-    unsigned int        pooled_h        = 0;
     const PoolingType   pool_type       = pool_info.pool_type();
-    const int           pool_size_x     = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_info.pool_size().width;
-    const int           pool_size_y     = pool_info.is_global_pooling() ? input->info()->dimension(1) : pool_info.pool_size().height;
+    DataLayout          data_layout     = input->info()->data_layout();
+    const int           idx_width       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int           idx_height      = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int           idx_channel     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int           pool_size_x     = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
+    const int           pool_size_y     = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
     const bool          exclude_padding = pool_info.exclude_padding();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
     const int pool_pad_top  = pad_stride_info.pad_top();
     const int pool_pad_left = pad_stride_info.pad_left();
 
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
     // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
-                                                     input->info()->dimension(1),
-                                                     pool_size_x,
-                                                     pool_size_y,
-                                                     pad_stride_info);
-
-    auto_init(input->info(), output->info(), pooled_w, pooled_h);
-
+    auto_init(input->info(), output->info(), pool_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
 
     // Set instance variables
@@ -190,7 +207,7 @@
     _output    = output;
     _pool_info = pool_info;
 
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    const GPUTarget gpu_target = get_target();
     const DataType  data_type  = input->info()->data_type();
 
     // Set build options
@@ -200,65 +217,93 @@
     build_opts.add_option_if(is_data_type_fixed_point(data_type),
                              "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
     build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
-    if(pool_type != PoolingType::MAX)
-    {
-        build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
-        build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_left)));
-        build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_top)));
-        build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
-        build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
-        build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
-    }
+    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+    build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+    build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
+    build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+    build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+    build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
 
     // Create kernel
-    if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
+    switch(data_layout)
     {
-        // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
-        // each thread computes 4 output elements
-        const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+        case DataLayout::NCHW:
+        {
+            build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+            build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+            if(pool_type != PoolingType::MAX)
+            {
+                build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+            }
 
-        std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
-                                  + support::cpp11::to_string(pool_size_x);
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-    }
-    else // Run general case
-    {
-        build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
-        build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
-        build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
+            if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
+            {
+                // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where
+                // each thread computes 4 output elements
+                const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
 
-        std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized" : "pooling_layer_MxN";
-        _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+                std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+                                          + support::cpp11::to_string(pool_size_x);
+                _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+            }
+            else // Run general case
+            {
+                std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
+                _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+            }
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+            build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+            build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+            std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+            _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
     }
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
 
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    ICLKernel::configure(std::get<1>(win_config));
 
     // Configure the local work size (hint) from the first two dimensions of the global work size.
     // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
     // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
     // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
-    if(gpu_target == GPUTarget::BIFROST)
+    if(data_layout == DataLayout::NCHW)
     {
-        cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config));
-        _lws_hint       = cl::NDRange(gws[0], gws[1], 1);
+        CLPoolingConfig pooling_config     = std::get<2>(win_config);
+        _num_elems_processed_per_iteration = pooling_config.first;
+        _border_size                       = pooling_config.second;
+        if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
+        {
+            cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config));
+            _lws_hint       = cl::NDRange(gws[0], gws[1], 1);
+        }
     }
-
-    ICLKernel::configure(std::get<1>(win_config));
-
-    CLPoolingConfig pooling_config     = std::get<2>(win_config);
-    _num_elems_processed_per_iteration = pooling_config.first;
-    _border_size                       = pooling_config.second;
+    else
+    {
+        _border_size                       = BorderSize(1, 0, 0, 0);
+        _num_elems_processed_per_iteration = 8;
+    }
 
     // Set config_id for enabling LWS tuning
     _config_id = "pooling_layer_";
     _config_id += lower_string(string_from_data_type(data_type));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += lower_string(string_from_data_layout(data_layout));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(idx_height));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(idx_channel));
 }
 
 Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -278,25 +323,52 @@
     unsigned int pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
 
-    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice            = window_collapsed.first_slice_window_3D();
-
-    do
+    switch(_input->info()->data_layout())
     {
-        // Upsample input by pool size
-        Window in_slice(slice);
-        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info().pad_left(),
-                                                     (in_slice.x().end() - _pool_info.pad_stride_info().pad_left()) * pool_stride_x,
-                                                     pool_stride_x * _num_elems_processed_per_iteration));
-        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info().pad_top(),
-                                                     (in_slice.y().end() - _pool_info.pad_stride_info().pad_top()) * pool_stride_y,
-                                                     pool_stride_y));
+        case DataLayout::NCHW:
+        {
+            Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+            Window slice            = window_collapsed.first_slice_window_3D();
+            do
+            {
+                // Upsample input by pool size
+                Window in_slice(slice);
+                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info().pad_left(),
+                                                             (in_slice.x().end() - _pool_info.pad_stride_info().pad_left()) * pool_stride_x,
+                                                             pool_stride_x * _num_elems_processed_per_iteration));
+                in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info().pad_top(),
+                                                             (in_slice.y().end() - _pool_info.pad_stride_info().pad_top()) * pool_stride_y,
+                                                             pool_stride_y));
 
-        // Set inputs
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, _lws_hint);
+                // Set inputs
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, in_slice);
+                add_3D_tensor_argument(idx, _output, slice);
+                enqueue(queue, *this, slice, _lws_hint);
+            }
+            while(window_collapsed.slide_window_slice_3D(slice));
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            Window slice = window.first_slice_window_3D();
+
+            Window in_slice = window.first_slice_window_3D();
+            in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
+            in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
+            in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+            do
+            {
+                // Set inputs
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, in_slice);
+                add_3D_tensor_argument(idx, _output, slice);
+                enqueue(queue, *this, slice, _lws_hint);
+            }
+            while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(in_slice));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
     }
-    while(window_collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index 8b082a8..028e508 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Configure window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+    // Update window and padding
+    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 CLQuantizationLayerKernel::CLQuantizationLayerKernel()
     : _input(nullptr), _output(nullptr), _min_max(nullptr)
 {
@@ -41,37 +81,30 @@
 
 void CLQuantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *min_max)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output, min_max);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
 
     _input   = input;
     _output  = output;
     _min_max = min_max;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("quantization_layer"));
 
-    // Configure window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
 
-    // Update window and padding
-    update_window_and_padding(win, input_access, output_access, min_max_access);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+    ICLKernel::configure(std::get<1>(win_config));
+}
 
-    ICLKernel::configure(win);
+Status CLQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+    return Status{};
 }
 
 void CLQuantizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 1dd5eb9..25b756b 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -38,6 +38,52 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_UNUSED(op);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+{
+    // Output tensor auto initialization if not yet initialized
+    TensorShape output_shape{ input->tensor_shape() };
+    output_shape.set(axis, 1);
+    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+    const unsigned int num_elems_processed_per_iteration = 16;
+
+    Window             win          = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    const unsigned int border_width = ((input->dimension(0) % 128) != 0) ? 128 - input->dimension(0) % 128 : 0;
+
+    AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
+    AccessWindowHorizontal output_access(output, 0, 1);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, output->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 CLReductionOperationKernel::CLReductionOperationKernel()
     : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE), _border_size()
 {
@@ -50,17 +96,9 @@
 
 void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Output tensor auto initialization if not yet initialized
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(axis, 1);
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
     const unsigned int num_elems_processed_per_iteration = 16;
     const unsigned int border_width                      = ((input->info()->dimension(0) % 128) != 0) ? 128 - input->info()->dimension(0) % 128 : 0;
@@ -97,15 +135,19 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
 
-    AccessWindowStatic     input_access(input->info(), 0, 0, input->info()->dimension(0) + border_width, 1);
-    AccessWindowHorizontal output_access(output->info(), 0, 1);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, output->info()->valid_region());
+    ICLKernel::configure(std::get<1>(win_config));
+}
 
-    ICLKernel::configure(win);
+Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+
+    return Status{};
 }
 
 void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 673304a..9b8a582 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -102,7 +102,7 @@
     output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
                                                                      output->info()->tensor_shape(),
                                                                      policy,
-                                                                     border,
+                                                                     sampling_policy,
                                                                      border_undefined));
 
     ICLKernel::configure(win);
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 3b5fbc9..b80a612 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..b8bce38
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <map>
+
+using namespace arm_compute;
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int width_offset, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = 16;
+
+    // The window needs to be based on input as we copy all the widths of input
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::F16, DataType::U32,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) + width_offset > output->dimension(0));
+
+    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
+
+    return Status{};
+}
+} // namespace
+
+CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _width_offset(0)
+{
+}
+
+Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, width_offset, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), width_offset, output->clone().get()).first);
+    return Status{};
+}
+
+void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), width_offset, output->info()));
+
+    _input        = input;
+    _output       = output;
+    _width_offset = width_offset;
+
+    const unsigned int num_elems_processed_per_iteration = 16;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
+
+    const int offset_to_first_elements_in_bytes = _width_offset * _output->info()->strides_in_bytes()[0];
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), width_offset, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure(std::get<1>(win_config));
+}
+
+void CLWidthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
new file mode 100644
index 0000000..41b3ac5
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+    const Size2D kernel_size      = winograd_info.kernel_size;
+    const Size2D output_tile_size = winograd_info.output_tile_size;
+
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+                                    && output_tile_size != Size2D(4U, 4U),
+                                    "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    const unsigned int num_elems_processed_per_iteration_x = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH));
+    const unsigned int num_elems_processed_per_iteration_y = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT));
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    bool   window_changed = false;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowStatic    output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+    window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win_collapsed);
+}
+} // namespace
+
+CLWinogradFilterTransformKernel::CLWinogradFilterTransformKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), winograd_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
+
+    const size_t idx_c = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(idx_c)));
+
+    const Size2D kernel_size      = winograd_info.kernel_size;
+    const Size2D output_tile_size = winograd_info.output_tile_size;
+
+    // Create kernel
+    std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_nchw";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void CLWinogradFilterTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Setup output window
+    Window window_out;
+    window_out.use_tensor_dimensions(_output->info()->tensor_shape(), 0);
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, window);
+    add_3D_tensor_argument(idx, _output, window_out);
+    enqueue(queue, *this, window);
+}
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
new file mode 100644
index 0000000..febd22b
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+                                    && output_tile_size != Size2D(4U, 4U),
+                                    "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels");
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_UNUSED(output_tile_size);
+    ARM_COMPUTE_UNUSED(kernel_size);
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+
+    const unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
+    const unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
+
+    Window win = calculate_max_window(*input, Steps(1, 1));
+
+    AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+
+    bool window_changed = update_window_and_padding(win, input_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
+    : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
+{
+}
+
+BorderSize CLWinogradInputTransformKernel::border_size() const
+{
+    return _border_size;
+}
+
+void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), winograd_info));
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+
+    // Compute number of elements to process in the X and Y direction
+    const int num_elements_x = input->info()->dimension(0) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+    const int num_elements_y = input->info()->dimension(1) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
+
+    // Check if we need to extend the right or bottom border
+    const unsigned int extra_border_right  = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
+    const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1);
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right() + extra_border_right, conv_info.pad_bottom() + extra_border_bottom, conv_info.pad_left());
+    _num_tiles_x = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+    _num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+
+    const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input->info(), winograd_info);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
+    build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+    build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+
+    // Create kernel
+    std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
+
+    // Select the step_z == 2 optimized kernel variant when the output tile is 2x2 and the input depth is even
+    if(output_tile_size == Size2D(2U, 2U))
+    {
+        _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
+    }
+
+    _lws_hint = cl::NDRange(1, 1, 8);
+
+    // Append stepz and data layout
+    kernel_name += "_stepz";
+    kernel_name += support::cpp11::to_string(_step_z);
+    kernel_name += "_nchw";
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Create window and update padding
+    auto win_config = validate_and_configure_window(input->info(), output->info(), winograd_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
+    _config_id = kernel_name;
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_info.pad_left());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_info.pad_top());
+}
+
+Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), winograd_info).first);
+
+    return Status{};
+}
+
+void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+    slice.set(Window::DimX, Window::Dimension(0, _num_tiles_x, 1));
+    slice.set(Window::DimY, Window::Dimension(0, _num_tiles_y, 1));
+
+    ARM_COMPUTE_ERROR_ON(((slice.z().end() - slice.z().start()) % _step_z) != 0);
+    slice.set(Window::DimZ, Window::Dimension(slice.z().start(), slice.z().end(), _step_z));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
new file mode 100644
index 0000000..5c0a735
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(winograd_info.output_data_layout != DataLayout::NCHW);
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+    const Size2D        input_dimensions = winograd_info.input_dimensions;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches");
+
+    // Compute number of elements to process in the X and Y direction
+    const int num_elements_x = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+    const int num_elements_y = input_dimensions.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
+    const int num_tiles_x    = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+    const int num_tiles_y    = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y)));
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    AccessWindowStatic    output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), output_tile_size.width), ceil_to_multiple(output->dimension(1), output_tile_size.height));
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+        window_changed = update_window_and_padding(win, input_access, bias_access, output_access);
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, output_access);
+    }
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), winograd_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info));
+
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
+    // Compute num_tiles_x
+    const Size2D        input_dimensions = winograd_info.input_dimensions;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const int           num_elements_x   = input_dimensions.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
+    const int           num_tiles_x      = std::ceil(num_elements_x / static_cast<float>(output_tile_size.width));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
+    build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles_x));
+
+    // Create kernel
+    std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_nchw";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), winograd_info.output_tile_size);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(), winograd_info.output_tile_size).first);
+
+    return Status{};
+}
+
+void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup output slice
+    Window slice_out(slice);
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    if(_bias != nullptr)
+    {
+        unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
+        add_1D_tensor_argument(idx1, _bias, slice_biases);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
new file mode 100644
index 0000000..9c2b41b
--- /dev/null
+++ b/src/core/CPP/CPPTypes.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+#include "arm_compute/core/Error.h"
+
+#ifndef BARE_METAL
+#include <sched.h>
+#endif /* !defined(BARE_METAL) */
+
+using namespace arm_compute;
+
+void CPUInfo::set_fp16(const bool fp16)
+{
+    _fp16 = fp16;
+}
+
+void CPUInfo::set_dotprod(const bool dotprod)
+{
+    _dotprod = dotprod;
+}
+
+void CPUInfo::set_cpu_model(unsigned int cpuid, CPUModel model)
+{
+    ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size());
+    if(_percpu.size() > cpuid)
+    {
+        _percpu[cpuid] = model;
+    }
+}
+
+bool CPUInfo::has_fp16() const
+{
+    return _fp16;
+}
+
+bool CPUInfo::has_dotprod() const
+{
+    return _dotprod;
+}
+
+CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
+{
+    if(cpuid < _percpu.size())
+    {
+        return _percpu[cpuid];
+    }
+    return CPUModel::GENERIC;
+}
+
+unsigned int CPUInfo::get_L1_cache_size() const
+{
+    return _L1_cache_size;
+}
+
+void CPUInfo::set_L1_cache_size(unsigned int size)
+{
+    _L1_cache_size = size;
+}
+
+unsigned int CPUInfo::get_L2_cache_size() const
+{
+    return _L2_cache_size;
+}
+
+void CPUInfo::set_L2_cache_size(unsigned int size)
+{
+    _L2_cache_size = size;
+}
+
+void CPUInfo::set_cpu_num(unsigned int cpu_count)
+{
+    _percpu.resize(cpu_count);
+}
+
+CPUInfo::CPUInfo()
+    : _percpu(1)
+{
+    // The core library knows nothing about the CPUs so we set only 1 CPU to be generic.
+    // The runtime NESCheduler will initialise this vector with the correct CPU models.
+    // See void detect_cpus_configuration(CPUInfo &cpuinfo) in CPPUtils.h
+    _percpu[0] = CPUModel::GENERIC;
+}
+
+CPUModel CPUInfo::get_cpu_model() const
+{
+#if defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__))
+    return get_cpu_model(0);
+#else  /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+    return get_cpu_model(sched_getcpu());
+#endif /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+}
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 62a2477..5037ac5 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,7 +35,26 @@
 {
 bool compare_detection_window(const DetectionWindow &lhs, const DetectionWindow &rhs)
 {
-    return lhs.score > rhs.score;
+    if(lhs.idx_class < rhs.idx_class)
+    {
+        return true;
+    }
+    if(rhs.idx_class < lhs.idx_class)
+    {
+        return false;
+    }
+
+    // idx_classes are equal so compare by score
+    if(lhs.score > rhs.score)
+    {
+        return true;
+    }
+    if(rhs.score > lhs.score)
+    {
+        return false;
+    }
+
+    return false;
 }
 } // namespace
 
@@ -70,7 +89,7 @@
     const size_t num_candidates = _input_output->num_values();
     size_t       num_detections = 0;
 
-    // Sort list of candidates
+    // Sort list of candidates by idx_class and then score
     std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
 
     const float min_distance_pow2 = _min_distance * _min_distance;
@@ -96,7 +115,7 @@
             const float xc = cur.x + cur.width * 0.5f;
             const float yc = cur.y + cur.height * 0.5f;
 
-            for(size_t k = i + 1; k < num_candidates; ++k)
+            for(size_t k = i + 1; k < (num_candidates) && (cur.idx_class == _input_output->at(k).idx_class); ++k)
             {
                 const float xn = _input_output->at(k).x + _input_output->at(k).width * 0.5f;
                 const float yn = _input_output->at(k).y + _input_output->at(k).height * 0.5f;
@@ -110,7 +129,7 @@
 
                     if(d < min_distance_pow2)
                     {
-                        // Invalidate keypoint
+                        // Invalidate detection window
                         _input_output->at(k).score = 0.0f;
                     }
                 }
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
new file mode 100644
index 0000000..d77d9c1
--- /dev/null
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+CPPUpsampleKernel::CPPUpsampleKernel()
+    : _input(nullptr), _output(nullptr), _info(), _inner_border()
+{
+}
+
+bool CPPUpsampleKernel::is_parallelisable() const
+{
+    return false;
+}
+
+void CPPUpsampleKernel::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info, unsigned int inner_border_right, unsigned int inner_border_top)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _input        = input;
+    _output       = output;
+    _info         = info;
+    _inner_border = std::make_pair(inner_border_right, inner_border_top);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The CPPUpsampleKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    ICPPKernel::configure(win);
+}
+
+void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    // Zero-initialize the whole output buffer before scattering input values into it
+    const int width_scaled  = _output->info()->dimension(0);
+    const int height_scaled = _output->info()->dimension(1);
+    const int stride_x      = _info.stride().first;
+    const int stride_y      = _info.stride().second;
+    const int start_x       = _info.pad().first;
+    const int start_y       = _inner_border.second + _info.pad().second;
+    const int end_y         = height_scaled - _info.pad().second;
+    const int end_x         = width_scaled - _inner_border.first - _info.pad().first;
+
+    std::fill_n(_output->buffer(), _output->info()->total_size(), 0);
+
+    // Create window
+    Window window_out(window);
+    window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
+    window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
+
+    // Create iterators
+    Iterator in(_input, window);
+    Iterator out(_output, window_out);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        *(reinterpret_cast<float *>(out.ptr())) = *(reinterpret_cast<const float *>(in.ptr()));
+    },
+    in, out);
+}
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/GLES_COMPUTE/GCHelpers.cpp
similarity index 66%
copy from src/graph/CL/CLUnmap.cpp
copy to src/core/GLES_COMPUTE/GCHelpers.cpp
index 31f2f19..8970688 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/GLES_COMPUTE/GCHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLUnmap.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute::graph;
-
-CLUnmap::CLUnmap(ITensorObject *tensor)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
+namespace arm_compute
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
+GPUTarget get_target_from_device()
 {
-    _tensor->unmap(arm_compute::CLScheduler::get().queue());
+    const std::string device_name = reinterpret_cast<const char *>(glGetString(GL_RENDERER));
+
+    return get_target_from_name(device_name);
 }
+} // namespace arm_compute
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index d4ce388..25ac02e 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -152,9 +152,9 @@
     ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name));
 
     _shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
-    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name);
+    ARM_COMPUTE_ERROR_ON_MSG(_shader_params_index == GL_INVALID_INDEX, "Failed to get index of %s", _shader_params_name);
     ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size));
-    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name);
+    ARM_COMPUTE_ERROR_ON_MSG(_shader_params_size == 0, "Failed to get size of %s", _shader_params_name);
 }
 
 void GCKernel::cleanup()
@@ -232,6 +232,14 @@
 {
 #ifdef EMBEDDED_KERNELS
     {
+        "helpers_cs.h",
+#include "./cs_shaders/helpers_cs.hembed"
+    },
+    {
+        "activation_layer_helpers_cs.h",
+#include "./cs_shaders/activation_layer_helpers_cs.hembed"
+    },
+    {
         "absdiff.cs",
 #include "./cs_shaders/absdiff.csembed"
     },
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
index 55b7f0d..ecd63b5 100644
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -62,7 +62,7 @@
 }
 
 IGCKernel::IGCKernel()
-    : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U))
+    : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U)), _target(GPUTarget::MIDGARD)
 {
 }
 
diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp
index d2539d0..e93b360 100644
--- a/src/core/GLES_COMPUTE/OpenGLES.cpp
+++ b/src/core/GLES_COMPUTE/OpenGLES.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,8 @@
 using glMemoryBarrier_func           = void GL_APIENTRY (*)(GLbitfield barriers);
 using glUniform1ui_func              = void GL_APIENTRY (*)(GLint location, GLuint v0);
 using glUnmapBuffer_func             = GLboolean GL_APIENTRY (*)(GLenum target);
-using glGetError_func                = GLenum              GL_APIENTRY (*)();
+using glGetError_func                = GLenum          GL_APIENTRY (*)();
+using glGetString_func               = const GLubyte * GL_APIENTRY (*)(GLenum name);
 using glGetActiveUniformBlockiv_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params);
 using glUniformBlockBinding_func     = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
 using glGetUniformBlockIndex_func    = GLuint GL_APIENTRY (*)(GLuint program, const GLchar *uniformBlockName);
@@ -668,6 +669,19 @@
     }
 }
 
+const GLubyte *GL_APIENTRY glGetString(GLenum name)
+{
+    auto func = GLESSymbols::get().glGetString;
+    if(func != nullptr)
+    {
+        return func(name);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 void GL_APIENTRY glGetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params)
 {
     auto func = GLESSymbols::get().glGetActiveUniformBlockiv;
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index 7d3f4ee..9a1e233 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,97 +23,9 @@
  */
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 
+#include "activation_layer_helpers_cs.h"
 #include "helpers_cs.h"
 
-#ifdef DATA_TYPE_FP32
-precision highp float;
-#elif defined(DATA_TYPE_FP16)
-#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
-precision highp float;
-#else  /*LOGISTIC_TANH_SRELU_SQRT*/
-precision mediump float;
-#endif /*LOGISTIC_TANH_SRELU_SQRT*/
-#endif /*DATA_TYPE_FP32*/
-
-#define ABS_OP(a) abs((a))
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define MLA_OP(a, b, c) ((b) * (c) + (a))
-#define DIV_OP(a, b) ((a) / (b))
-#define EXP_OP(a) exp((a))
-#define LOG_OP(a) log((a))
-#define SQRT_OP(a) sqrt((a))
-#define CONST_ONE (1.f)
-
-// Logistic Activation
-float logistic_op(float x)
-{
-    return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
-}
-// Hyperbolic Tangent Activation
-float tanh_op(float x)
-{
-    float tmp = float(B_VAL) * x;
-    if(tmp > 10.f)
-    {
-        return MUL_OP(float(A_VAL), 1.f);
-    }
-    else if(tmp < -10.f)
-    {
-        return MUL_OP(float(A_VAL), -1.f);
-    }
-    else
-    {
-        return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
-    }
-}
-// RELU Tangent Activation
-float relu_op(float x)
-{
-    return max(0.f, x);
-}
-// Bounded RELU Activation
-float brelu_op(float x)
-{
-    return min(float(A_VAL), max(float(0.0), x));
-}
-// Lower Upper Bounded RELU Activation
-float lu_brelu_op(float x)
-{
-    return min(max(x, float(B_VAL)), float(A_VAL));
-}
-// Leaky RELU Activation
-float lrelu_op(float x)
-{
-    return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
-}
-// Soft RELU Activation
-float srelu_op(float x)
-{
-    return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
-}
-// Absolute Activation
-float abs_op(float x)
-{
-    return ABS_OP(x);
-}
-// Square Activation
-float square_op(float x)
-{
-    return MUL_OP(x, x);
-}
-// Square-root Activation
-float sqrt_op(float x)
-{
-    return SQRT_OP(x);
-}
-// Linear Activation
-float linear_op(float x)
-{
-    return MLA_OP(float(B_VAL), float(A_VAL), x);
-}
-
 /** This performs an activation function floating point inputs.
  *
  * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
new file mode 100644
index 0000000..f43a33f
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
+precision highp float;
+#else  /*LOGISTIC_TANH_SRELU_SQRT*/
+precision mediump float;
+#endif /*LOGISTIC_TANH_SRELU_SQRT*/
+#endif /*DATA_TYPE_FP32*/
+
+#define ABS_OP(a) abs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define CONST_ONE (1.f)
+
+// Logistic Activation
+float logistic_op(float x)
+{
+    return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+vec4 logistic_op(vec4 x)
+{
+    return DIV_OP(vec4(CONST_ONE), ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+float tanh_op(float x)
+{
+    float tmp = float(B_VAL) * x;
+    if(tmp > 10.f)
+    {
+        return MUL_OP(float(A_VAL), 1.f);
+    }
+    else if(tmp < -10.f)
+    {
+        return MUL_OP(float(A_VAL), -1.f);
+    }
+    else
+    {
+        return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
+    }
+}
+// RELU Tangent Activation
+float relu_op(float x)
+{
+    return max(0.f, x);
+}
+vec4 relu_op(vec4 x)
+{
+    return max(vec4(0.f), x);
+}
+// Bounded RELU Activation
+float brelu_op(float x)
+{
+    return min(float(A_VAL), max(float(0.0), x));
+}
+// Lower Upper Bounded RELU Activation
+float lu_brelu_op(float x)
+{
+    return min(max(x, float(B_VAL)), float(A_VAL));
+}
+// Leaky RELU Activation
+float lrelu_op(float x)
+{
+    return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
+}
+// Soft RELU Activation
+float srelu_op(float x)
+{
+    return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+float abs_op(float x)
+{
+    return ABS_OP(x);
+}
+// Square Activation
+float square_op(float x)
+{
+    return MUL_OP(x, x);
+}
+// Square-root Activation
+float sqrt_op(float x)
+{
+    return SQRT_OP(x);
+}
+// Linear Activation
+float linear_op(float x)
+{
+    return MLA_OP(float(B_VAL), float(A_VAL), x);
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index 7629b25..81be967 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -50,6 +50,8 @@
  *
  * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
  * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
+ * @note Beta is optional with default value of 0. If not provided, the preprocessor argument "USE_DEFAULT_BETA" should be given
+ * @note Gamma is optional with default value of 1. If not provided, the preprocessor argument "USE_DEFAULT_GAMMA" should be given
  *
  * @param[in]  src_ptr     Pointer to the first source tensor. Supported data types: F16/F32
  * @param[in]  src_attrs   The attributes of the source tensor
@@ -59,10 +61,10 @@
  * @param[in]  mean_attrs  The attributes of the mean tensor
  * @param[in]  var_ptr     Pointer to the var tensor. Supported data types: same as @p src_ptr
  * @param[in]  var_attrs   The attributes of the var tensor
- * @param[in]  beta_ptr    Pointer to the beta source tensor. Supported data types: same as @p src_ptr
- * @param[in]  beta_attrs  The attributes of the beta tensor
- * @param[in]  gamma_ptr   Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
- * @param[in]  gamma_attrs The attributes of the gamma tensor
+ * @param[in]  beta_ptr    (Optional) Pointer to the beta source tensor. If not provided, default value of beta is 0. Supported data types: same as @p src_ptr
+ * @param[in]  beta_attrs  (Optional) The attributes of the beta tensor
+ * @param[in]  gamma_ptr   (Optional) Pointer to the gamma source tensor. If not provided, default value of gamma is 1. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_attrs (Optional) The attributes of the gamma tensor
  */
 SHADER_PARAMS_DECLARATION
 {
@@ -70,8 +72,12 @@
     Tensor3DAttributes dst_attrs;
     VectorAttributes   mean_attrs;
     VectorAttributes   var_attrs;
-    VectorAttributes   beta_attrs;
-    VectorAttributes   gamma_attrs;
+#ifndef USE_DEFAULT_BETA
+    VectorAttributes beta_attrs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    VectorAttributes gamma_attrs;
+#endif /* USE_DEFAULT_GAMMA */
 };
 
 #ifdef DATA_TYPE_FP32
@@ -79,24 +85,34 @@
 TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
 TENSOR_DECLARATION(3, meanBuffer, float, mean_ptr, mean_shift, 2, readonly);
 TENSOR_DECLARATION(4, varBuffer, float, var_ptr, var_shift, 2, readonly);
+#ifndef USE_DEFAULT_BETA
 TENSOR_DECLARATION(5, betaBuffer, float, beta_ptr, beta_shift, 2, readonly);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+#ifdef USE_DEFAULT_BETA
+TENSOR_DECLARATION(5, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
+#else  /* USE_DEFAULT_BETA */
 TENSOR_DECLARATION(6, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
+#endif /* USE_DEFAULT_BETA */
+#endif /* USE_DEFAULT_GAMMA */
 
 void main(void)
 {
-    Tensor3DIterator src_iter   = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
-    Tensor3DIterator dst_iter   = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-    VectorIterator   mean_iter  = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
-    VectorIterator   var_iter   = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
-    VectorIterator   beta_iter  = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
-    VectorIterator   gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+    Tensor3DIterator src_iter  = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator dst_iter  = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+    VectorIterator   mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
+    VectorIterator   var_iter  = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
+#ifndef USE_DEFAULT_BETA
+    VectorIterator beta_iter = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    VectorIterator gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+#endif /* USE_DEFAULT_GAMMA */
 
     float input_value = 0.f;
     float denominator = 0.f;
     float numerator   = 0.f;
     float x_bar       = 0.f;
-    float gamma_param = 0.f;
-    float beta_param  = 0.f;
 
     uint current_slice = gl_GlobalInvocationID.z;
 
@@ -109,10 +125,18 @@
     numerator = SUB_OP(input_value, numerator);
     x_bar     = MUL_OP(numerator, denominator);
 
-    gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
-    beta_param  = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+#ifndef USE_DEFAULT_GAMMA
+    float gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * gamma_attrs.stride_x));
 
-    STORE_CURRENT_ITEM(dst_ptr, dst_iter, ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param)));
+    x_bar = MUL_OP(gamma_param, x_bar);
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
+    float beta_param = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+
+    x_bar = ADD_OP(x_bar, beta_param);
+#endif /* USE_DEFAULT_BETA */
+
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, ACTIVATION_FUNC(x_bar));
 }
 
 #elif defined(DATA_TYPE_FP16)
@@ -120,8 +144,16 @@
 TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
 TENSOR_DECLARATION(3, meanBuffer, uvec2, mean_ptr, mean_shift, 3, readonly);
 TENSOR_DECLARATION(4, varBuffer, uvec2, var_ptr, var_shift, 3, readonly);
+#ifndef USE_DEFAULT_BETA
 TENSOR_DECLARATION(5, betaBuffer, uvec2, beta_ptr, beta_shift, 3, readonly);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+#ifdef USE_DEFAULT_BETA
+TENSOR_DECLARATION(5, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
+#else  /* USE_DEFAULT_BETA */
 TENSOR_DECLARATION(6, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
+#endif /* USE_DEFAULT_BETA */
+#endif /* USE_DEFAULT_GAMMA */
 
 void main(void)
 {
@@ -129,14 +161,18 @@
     Tensor3DIterator dst_iter   = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
     VectorIterator   mean_iter  = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
     VectorIterator   var_iter   = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
+#ifndef USE_DEFAULT_BETA
     VectorIterator   beta_iter  = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
     VectorIterator   gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+#endif /* USE_DEFAULT_GAMMA */
 
     vec4  unpacked_s[5];
     float denominator;
     float numerator;
-    float gamma_param;
-    float beta_param;
+    float gamma_param = 1.f;
+    float beta_param  = 0.f;
     vec4  x_bar;
     vec4  result;
 
@@ -144,68 +180,87 @@
     unpacked_s[0]      = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
     unpacked_s[1]      = LOAD_UNPACK4_HALF(var_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(var_iter, current_slice * var_attrs.stride_x));
     unpacked_s[2]      = LOAD_UNPACK4_HALF(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
-    unpacked_s[3]      = LOAD_UNPACK4_HALF(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
+#ifndef USE_DEFAULT_GAMMA
+    unpacked_s[3]      = LOAD_UNPACK4_HALF(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * gamma_attrs.stride_x));
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_BETA
     unpacked_s[4]      = LOAD_UNPACK4_HALF(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+#endif /* USE_DEFAULT_GAMMA */
 
     if((current_slice % uint(4)) == uint(0))
     {
         denominator = unpacked_s[1].x;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
-        //Calculate x bar and store results
-        numerator = unpacked_s[2].x;
-        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+        // Calculate x bar
+        numerator   = unpacked_s[2].x;
+        x_bar       = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
+#ifndef USE_DEFAULT_GAMMA
         gamma_param = unpacked_s[3].x;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
         beta_param  = unpacked_s[4].x;
-        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
-        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
     }
     else if((current_slice % uint(4)) == uint(1))
     {
         denominator = unpacked_s[1].y;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
-        //Calculate x bar and store results
-        numerator = unpacked_s[2].y;
-        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+        // Calculate x bar
+        numerator   = unpacked_s[2].y;
+        x_bar       = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
+#ifndef USE_DEFAULT_GAMMA
         gamma_param = unpacked_s[3].y;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
         beta_param  = unpacked_s[4].y;
-        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
-        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
     }
     else if((current_slice % uint(4)) == uint(2))
     {
         denominator = unpacked_s[1].z;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
-        //Calculate x bar and store results
-        numerator = unpacked_s[2].z;
-        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+        // Calculate x bar
+        numerator   = unpacked_s[2].z;
+        x_bar       = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
+#ifndef USE_DEFAULT_GAMMA
         gamma_param = unpacked_s[3].z;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
         beta_param  = unpacked_s[4].z;
-        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
-        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
     }
     else
     {
         denominator = unpacked_s[1].w;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
-        //Calculate x bar and store results
-        numerator = unpacked_s[2].w;
-        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+        // Calculate x bar
+        numerator   = unpacked_s[2].w;
+        x_bar       = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
+#ifndef USE_DEFAULT_GAMMA
         gamma_param = unpacked_s[3].w;
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
         beta_param  = unpacked_s[4].w;
-        result      = ACTIVATION_FUNC(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
-
-        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+#endif /* USE_DEFAULT_BETA */
     }
+
+#ifndef USE_DEFAULT_GAMMA
+    x_bar = MUL_OP(gamma_param, x_bar);
+#endif /* USE_DEFAULT_GAMMA */
+#ifndef USE_DEFAULT_BETA
+    x_bar = ADD_OP(x_bar, beta_param);
+#endif /* USE_DEFAULT_BETA */
+
+    result = ACTIVATION_FUNC(x_bar);
+
+    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
 }
 #endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 774173d..5e7609c 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -62,7 +62,56 @@
     uint total_filters;
 };
 
-#if defined(DATA_TYPE_FP16)
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+#ifdef HAS_BIAS
+TENSOR_DECLARATION(3, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+void main()
+{
+    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+    ImageIterator    dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+#ifdef HAS_BIAS
+    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
+                           && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(
+                                                    gl_GlobalInvocationID.z)
+                                                * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
+    // Linearize convolution elements
+    if(is_last_thread)
+    {
+        for(uint i = 0u; i < uint(total_filters); ++i)
+        {
+            float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+            STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+#ifdef HAS_BIAS
+            float b = LOAD_CURRENT_ITEM(biases_ptr, biases_iter);
+            STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, biases_attrs.stride_x);
+#endif /* HAS_BIAS */
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
+        }
+    }
+    else
+    {
+        for(uint i = 0u; i < uint(total_filters); ++i)
+        {
+            float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+            STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
+        }
+    }
+}
+
+#elif defined(DATA_TYPE_FP16)
 
 TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
 TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
@@ -72,10 +121,10 @@
 
 void main()
 {
-    Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
-    ImageIterator    dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+    Tensor3DIterator src_iter    = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+    ImageIterator    dst_iter    = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
 #ifdef HAS_BIAS
-    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+    VectorIterator   biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
     bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
@@ -151,7 +200,7 @@
     }
 }
 
-#endif /* DATA_TYPE_FP16 */
+#endif /* DATA_TYPE_FP32 */
 #endif // RESHAPE_TO_COLUMNS
 
 #ifdef IM2COL_GENERIC
@@ -164,6 +213,7 @@
  * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx"
  * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx"
  * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx"
+ * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx"
  * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr      Pointer to the source tensor. Supported data types: F16/F32
@@ -192,30 +242,31 @@
     Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
     ImageIterator    dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
 
-    uint xc    = gl_GlobalInvocationID.x;                // x coordinate in the convolved tensor
-    uint yc    = gl_GlobalInvocationID.y;                // y coordinate in the convolved tensor
-    uint ch    = gl_GlobalInvocationID.z % KERNEL_DEPTH; // input feature map
-    uint batch = gl_GlobalInvocationID.z / KERNEL_DEPTH; // the batch
+    int xc    = int(gl_GlobalInvocationID.x);                // x coordinate in the convolved tensor
+    int yc    = int(gl_GlobalInvocationID.y);                // y coordinate in the convolved tensor
+    int ch    = int(gl_GlobalInvocationID.z) % KERNEL_DEPTH; // input feature map
+    int batch = int(gl_GlobalInvocationID.z) / KERNEL_DEPTH; // the batch
 
     // Calculate input indeces
-    uint xi = xc * uint(STRIDE_X) - uint(PAD_LEFT);
-    uint yi = yc * uint(STRIDE_Y) - uint(PAD_TOP);
-    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w));
+    int xi = xc * STRIDE_X - PAD_LEFT;
+    int yi = yc * STRIDE_Y - PAD_TOP;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * int(src_attrs.stride_z)) + (batch * int(src_stride_w)));
 
     // Calculate output indeces
-    uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
-    uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
-    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo);
+    int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+    int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+    // sizeof is not available in GLES, so we'll use stride_x
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * int(dst_attrs.stride_y)) + (batch * int(dst_stride_w)) + xo * int(dst_attrs.stride_x));
 
     uint src_pos = 0u;
 
     // Linearize convolution elements
-    for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT * DILATION_Y; y < y_e; y += DILATION_Y)
     {
-        for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, TENSOR_OFFSET_ADVANCE(dst_iter, 1u))
+        for(int x = xi, x_e = xi + KERNEL_WIDTH * DILATION_X; x < x_e; x += DILATION_X, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, int(dst_attrs.stride_x)))
         {
 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
-            src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
+            src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
             STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
 #else  /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
             if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
@@ -224,7 +275,7 @@
             }
             else
             {
-                src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
+                src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
                 STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
             }
 #endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
@@ -232,7 +283,7 @@
     }
 
 #ifdef HAS_BIAS
-    if(ch == (uint(KERNEL_DEPTH) - 1))
+    if(ch == (KERNEL_DEPTH - 1))
     {
         STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
     }
@@ -659,6 +710,7 @@
 #endif /* DATA_TYPE_FP32 */
 #endif /* IM2COL_REDUCED */
 
+#ifdef COL2IM
 #ifdef WIDTH_OUTPUT
 
 /** This kernel performs a reshaping of the output of the convolution layer.
@@ -692,10 +744,9 @@
     Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
     uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
-    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * WIDTH_OUTPUT * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * (src_attrs.stride_z));
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ);
 
-    STORE_CURRENT_ITEM(dst_ptr, dst_iter,
-                       LOAD_CURRENT_ITEM(src_ptr, src_iter));
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
 }
 
 #elif defined(DATA_TYPE_FP16)
@@ -735,4 +786,5 @@
 #else /* DATA_TYPE_FP32 */
 #error Data type not supported
 #endif /* DATA_TYPE_FP32 */
+#endif /* WIDTH_OUTPUT */
 #endif /* COL2IM */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
index adfc126..134cc10 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,6 +108,8 @@
     uint z_index = gl_GlobalInvocationID.z;
     TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
 
+    src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
+
     vec4 w[3];
     w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
     w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
@@ -263,6 +265,8 @@
     uint z_index = gl_GlobalInvocationID.z;
     TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
 
+    src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
+
     vec4 w[3];
     w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
     w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index ea4e9c1..b42c09b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,10 @@
 
 #include "helpers_cs.h"
 
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
 #if defined(DATA_TYPE_FP16)
 precision mediump float;
 #endif // DATA_TYPE_FP16
@@ -99,6 +103,10 @@
     pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -210,6 +218,10 @@
     pixels += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 #elif defined(PROCESS_4X_2Y_1Z)
@@ -333,6 +345,11 @@
     pixels[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
 }
@@ -470,6 +487,12 @@
     pixels[2] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -609,6 +632,13 @@
     pixels1[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0]  = ACT_OP(pixels[0]);
+    pixels[1]  = ACT_OP(pixels[1]);
+    pixels1[0] = ACT_OP(pixels1[0]);
+    pixels1[1] = ACT_OP(pixels1[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels1[0]);
@@ -745,6 +775,11 @@
         pixels[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+        pixels[0] = ACT_OP(pixels[0]);
+        pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
         STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
 
@@ -868,6 +903,11 @@
     pixels[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 #elif defined(PROCESS_8X_2Y_1Z)
@@ -1001,6 +1041,13 @@
     pixels1[1] += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0]  = ACT_OP(pixels[0]);
+    pixels[1]  = ACT_OP(pixels[1]);
+    pixels1[0] = ACT_OP(pixels1[0]);
+    pixels1[1] = ACT_OP(pixels1[1]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
     STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
 }
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index 855d450..e51cc37 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,10 @@
 
 #include "helpers_cs.h"
 
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
 #if defined(DATA_TYPE_FP16)
 precision mediump float;
 #endif // DATA_TYPE_FP16
@@ -114,6 +118,10 @@
     pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -238,6 +246,11 @@
     pixels[1] += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+#endif /* FUSED_ACTIVATION */
+
     VSTORE2_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -335,6 +348,10 @@
     pixels += b;
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 
@@ -434,6 +451,12 @@
     pixels[2] += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels[0]);
     STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -601,6 +624,12 @@
     }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -728,6 +757,10 @@
     pixels += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
 }
 
@@ -841,6 +874,12 @@
     }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -962,6 +1001,13 @@
     }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels[0] = ACT_OP(pixels[0]);
+    pixels[1] = ACT_OP(pixels[1]);
+    pixels[2] = ACT_OP(pixels[2]);
+    pixels[3] = ACT_OP(pixels[3]);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
     STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
@@ -1087,6 +1133,13 @@
         }
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+        pixels[0] = ACT_OP(pixels[0]);
+        pixels[1] = ACT_OP(pixels[1]);
+        pixels[2] = ACT_OP(pixels[2]);
+        pixels[3] = ACT_OP(pixels[3]);
+#endif /* FUSED_ACTIVATION */
+
         STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
         STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
         STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index c919e4e..728e964 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,10 @@
 
 #include "helpers_cs.h"
 
+#ifdef FUSED_ACTIVATION
+#include "activation_layer_helpers_cs.h"
+#endif /* FUSED_ACTIVATION */
+
 #if defined(DATA_TYPE_FP16)
 precision mediump float;
 #endif // DATA_TYPE_FP16
@@ -116,6 +120,10 @@
     pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    pixels = ACT_OP(pixels);
+#endif /* FUSED_ACTIVATION */
+
     STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
 #elif defined(DATA_TYPE_FP16)
@@ -204,6 +212,10 @@
     res += vec4(b);
 #endif /* BIAS */
 
+#ifdef FUSED_ACTIVATION
+    res = ACT_OP(res);
+#endif /* FUSED_ACTIVATION */
+
     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
 }
 
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index ba50721..a65f980 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -132,7 +132,7 @@
 /** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
  *
  * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_attrs The attributes of the source matrix
@@ -220,7 +220,9 @@
 /** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
  *
  * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_attrs The attributes of the source matrix
@@ -344,6 +346,184 @@
 }
 #endif /* GEMM_MM_FLOATING_POINT */
 
+#ifdef GEMM_MM_FLOATING_POINT_BIFROST
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B in case neither of them has been reshaped beforehand
+ *
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ *
+ * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_attrs The attributes of the source matrix
+ * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_attrs  The attributes of the destination matrix
+ */
+SHADER_PARAMS_DECLARATION
+{
+    ImageAttributes src0_attrs;
+    ImageAttributes src1_attrs;
+    ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
+TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
+void main()
+{
+    ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Advance src0 to this invocation's first row of A and src1 to its first 4-wide column block of B */
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);
+
+    /* Reset accumulators (one vec4 accumulator per processed row of A) */
+    vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    // Main loop: consumes 4 columns of A per iteration; A and B src indices get incremented at the same time.
+    int i = 0;
+    for(; i <= (COLS_A - 4); i += 4)
+    {
+        // Load values from matrix A and matrix B
+        vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec4 a1 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec4 a2 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 a3 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+        // Multiply and accumulate
+        acc0 += b0 * vec4(a0.x);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix B
+        b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+        // Multiply and accumulate
+        acc0 += b0 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix B
+        b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+        // Multiply and accumulate
+        acc0 += b0 * vec4(a0.z);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        // Load values from matrix B
+        b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+        // Multiply and accumulate
+        acc0 += b0 * vec4(a0.w);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        TENSOR_ITERATOR_ADVANCE(src0_iter, 4);
+    }
+
+    for(; i < COLS_A; ++i)
+    {
+        // Leftover loop: load one remaining value per processed row of A
+        float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+
+        // Multiply and accumulate
+        acc0 += b0 * vec4(a0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+        TENSOR_ITERATOR_ADVANCE(src0_iter, 1);
+    }
+
+    /* Scale the accumulated results by alpha and store them to the destination */
+    acc0 = acc0 * vec4(ALPHA);
+    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 = acc1 * vec4(ALPHA);
+    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 = acc2 * vec4(ALPHA);
+    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 = acc3 * vec4(ALPHA);
+    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif /* GEMM_MM_FLOATING_POINT_BIFROST */
+
 #ifdef GEMM_MATRIXADDITION
 /** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
  *
@@ -461,7 +641,7 @@
 /** This OpenGL ES kernel computes the matrix multiplication between matrix A(src0) and matrix B(src1)
  * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
  *
  * @param[in]  src0_ptr   Pointer to the source matrix.Supported data types: F16
  * @param[in]  src0_attrs The attributes of the source matrix
@@ -836,7 +1016,7 @@
 /** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
  *
  * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_attrs The attributes of the source matrix
diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in
index 15ce8ee..17e3aee 100644
--- a/src/core/GLES_COMPUTE/gl_entries.in
+++ b/src/core/GLES_COMPUTE/gl_entries.in
@@ -61,3 +61,4 @@
 GL_ENTRY(glDeleteFramebuffers)
 GL_ENTRY(glBindFramebuffer)
 GL_ENTRY(glFramebufferTexture2D)
+GL_ENTRY(glGetString)
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index d7c645d..8287823 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -111,8 +111,9 @@
 
     _output->set_needs_shifting(true);
 
-    Window slice    = window.first_slice_window_3D();
-    Window slice_in = window.first_slice_window_3D();
+    Window collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+    Window slice_in  = collapsed.first_slice_window_3D();
 
     slice.shift(Window::DimX, -(_output->info()->padding()).left);
 
@@ -125,10 +126,10 @@
     {
         unsigned int idx     = 0;
         unsigned int binding = 1;
-        add_3D_tensor_argument(idx, _input, binding++, slice_in);
-        add_3D_tensor_argument(idx, _output, binding++, slice);
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice_in);
         _kernel.update_shader_params();
         enqueue(*this, slice);
     }
-    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
+    while(collapsed.slide_window_slice_3D(slice) && collapsed.slide_window_slice_3D(slice_in));
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index cd93f69..9a592df 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -36,6 +36,105 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ITensorInfo *mean, const ITensorInfo *var,
+                          const ITensorInfo *beta, const ITensorInfo *gamma,
+                          float epsilon, ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+    ARM_COMPUTE_UNUSED(var);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+
+    if(output->total_size() != 0) // output already initialized: check it matches the input
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    if(beta != nullptr) // beta is optional (defaults to 0 in the shader when absent)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
+    }
+    if(gamma != nullptr) // gamma is optional (defaults to 1 in the shader when absent)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
+    }
+    if(act_info.enabled()) // only float RELU-family fused activations are supported
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
+        ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
+                                    && act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
+                                    && act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                        ITensorInfo *mean, ITensorInfo *var,
+                                                        ITensorInfo *beta, ITensorInfo *gamma)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+
+    unsigned int num_elems_processed_per_iteration = 1;
+    if(input->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     mean_access(mean, 0, 0, mean->dimension(0) + 3, mean->dimension(1));
+    AccessWindowStatic     var_access(var, 0, 0, var->dimension(0) + 3, var->dimension(1));
+
+    bool window_changed = false;
+    if(beta != nullptr)
+    {
+        AccessWindowStatic beta_access(beta, 0, 0, beta->dimension(0) + 3, beta->dimension(1));
+        if(gamma != nullptr)
+        {
+            AccessWindowStatic gamma_access(gamma, 0, 0, gamma->dimension(0) + 3, gamma->dimension(1));
+            window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
+        }
+        else
+        {
+            window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access);
+        }
+    }
+    else
+    {
+        if(gamma != nullptr)
+        {
+            AccessWindowStatic gamma_access(gamma, 0, 0, gamma->dimension(0) + 3, gamma->dimension(1));
+            window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, gamma_access);
+        }
+        else
+        {
+            window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access);
+        }
+    }
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 GCBatchNormalizationLayerKernel::GCBatchNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0.0f)
 {
@@ -44,24 +143,11 @@
 void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma,
                                                 float epsilon, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, var);
 
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
-    if(act_info.enabled())
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->data_type() != DataType::F32 && input->info()->data_type() != DataType::F16);
-        ARM_COMPUTE_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
-                             && act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
-                             && act_info.activation() != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
-        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), var->info(),
+                                                  (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr,
+                                                  epsilon, act_info));
 
     _input   = input;
     _output  = output;
@@ -71,12 +157,6 @@
     _gamma   = gamma;
     _epsilon = epsilon;
 
-    unsigned int num_elems_processed_per_iteration = 1;
-    if(input->info()->data_type() == DataType::F16)
-    {
-        num_elems_processed_per_iteration = 4;
-    }
-
     // Set build options
     std::set<std::string> build_opts;
     std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
@@ -85,6 +165,14 @@
     build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
     build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
     build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+    if(beta == nullptr)
+    {
+        build_opts.emplace("#define USE_DEFAULT_BETA");
+    }
+    if(gamma == nullptr)
+    {
+        build_opts.emplace("#define USE_DEFAULT_GAMMA");
+    }
 
     if(act_info.enabled())
     {
@@ -97,19 +185,25 @@
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), var->info(),
+                                                    (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 3, mean->info()->dimension(1));
-    AccessWindowStatic     var_access(var->info(), 0, 0, var->info()->dimension(0) + 3, var->info()->dimension(1));
-    AccessWindowStatic     beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 3, beta->info()->dimension(1));
-    AccessWindowStatic     gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 3, gamma->info()->dimension(1));
+    IGCKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+Status GCBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                 const ITensorInfo *mean, const ITensorInfo *var,
+                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
+                                                 float epsilon, ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
+    // beta/gamma are optional: guard the clone() calls so validate() accepts nullptr, as configure() does
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+                                                              mean->clone().get(), var->clone().get(),
+                                                              (beta != nullptr) ? beta->clone().get() : nullptr, (gamma != nullptr) ? gamma->clone().get() : nullptr)
+                                .first);

-    IGCKernel::configure(win);
+    return Status{};
 }
 
 void GCBatchNormalizationLayerKernel::run(const Window &window)
@@ -127,11 +221,18 @@
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-    unsigned int idx = 2 * num_arguments_per_3D_tensor();
-    add_1D_tensor_argument(idx, _mean, 3, vector_slice);
-    add_1D_tensor_argument(idx, _var, 4, vector_slice);
-    add_1D_tensor_argument(idx, _beta, 5, vector_slice);
-    add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
+    unsigned int idx           = 2 * num_arguments_per_3D_tensor();
+    unsigned int binding_point = 3;
+    add_1D_tensor_argument(idx, _mean, binding_point, vector_slice);
+    add_1D_tensor_argument(idx, _var, ++binding_point, vector_slice);
+    if(_beta != nullptr)
+    {
+        add_1D_tensor_argument(idx, _beta, ++binding_point, vector_slice);
+    }
+    if(_gamma != nullptr)
+    {
+        add_1D_tensor_argument(idx, _gamma, ++binding_point, vector_slice);
+    }
 
     slice.shift(Window::DimX, -(_output->info()->padding()).left);
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
index af1e34e..1554a89 100644
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -62,30 +62,32 @@
     _output         = output;
     _convolved_dims = convolved_dims;
 
-    unsigned int num_elems_processed_per_iteration = 1;
+    const DataType     dt         = input->info()->data_type();
+    const unsigned int local_size = 1;
 
     // Create kernel
     std::set<std::string> build_opts;
+    build_opts.emplace("#define COL2IM ");
     build_opts.emplace("#define WIDTH_OUTPUT " + support::cpp11::to_string(_convolved_dims.first));
-    std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    const std::string dt_name = (dt == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
     build_opts.emplace(("#define " + dt_name));
-    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(local_size));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(local_size));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(local_size));
 
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("col2im", build_opts));
 
     // Configure window
-    unsigned int nums = 2;
-    Window       win  = calculate_max_window(*output->info(), Steps(nums));
+    const unsigned int num_elems_processed_per_iteration = (dt == DataType::F32) ? 1 : 2;
 
-    AccessWindowHorizontal output_access(output->info(), 0, 2);
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
     const int              input_padding = ceil_to_multiple(input->info()->dimension(0), 2) - input->info()->dimension(0);
 
     AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + input_padding, input->info()->dimension(1) + 1);
 
-    update_window_and_padding(win, input_access,
-                              output_access);
+    update_window_and_padding(win, input_access, output_access);
 
     output_access.set_valid_region(win, output->info()->valid_region());
 
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index 9343268..c237409 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -33,31 +33,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 using namespace arm_compute;
-
-namespace
-{
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-
-    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
-    TensorShape output_shape = input_shape;
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-
-    return output_shape;
-}
-} // namespace
+using namespace arm_compute::misc::shape_calculator;
 
 GCDepthwiseConvolutionLayer3x3Kernel::GCDepthwiseConvolutionLayer3x3Kernel()
     : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0), _lws(gles::NDRange(1U, 1U, 1U))
@@ -69,7 +48,8 @@
     return _border_size;
 }
 
-void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+                                                     unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -83,7 +63,7 @@
     }
 
     // Get convolved dimensions
-    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(),
@@ -93,6 +73,7 @@
                        input->info()->fixed_point_position());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(2) != weights->info()->dimension(2));
 
     _input         = input;
     _output        = output;
@@ -108,6 +89,7 @@
     ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
     std::set<std::string> options;
 
+    options.emplace("#define DEPTH_MULTIPLIER " + support::cpp11::to_string(depth_multiplier));
     options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
     options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
     options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 99b5e7d..6b16def 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -50,7 +50,8 @@
 }
 
 template <unsigned int kernel_size>
-void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output,
+                                                            const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
@@ -58,6 +59,7 @@
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
     ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
     ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(act_info.enabled() && act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
 
     if(bias != nullptr)
     {
@@ -106,6 +108,16 @@
     std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
     options.emplace(("#define " + dt_name));
 
+    // Activation information in case of a fused activation
+    if(act_info.enabled())
+    {
+        options.emplace("#define FUSED_ACTIVATION");
+        options.emplace(("#define " + string_from_activation_func(act_info.activation())));
+        options.emplace(("#define ACT_OP  " + lower_string(string_from_activation_func(act_info.activation())) + "_op"));
+        options.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
+        options.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
+    }
+
     unsigned int num_elems_read_per_iteration_x    = kernel_size * _conv_stride_x;
     unsigned int num_elems_read_per_iteration_y    = 1;
     unsigned int num_elems_written_per_iteration_x = 1;
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
index dc86bfb..171fbad 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -35,7 +35,6 @@
 #include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
-using namespace arm_compute::gles_compute;
 
 GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel()
     : _input(nullptr), _output(nullptr)
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
index 43846dc..1a68a62 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,6 @@
 #include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
-using namespace arm_compute::gles_compute;
 
 GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel()
     : _input(nullptr), _output(nullptr)
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a5f09e8..d576c30 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,38 +31,182 @@
 #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
 #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <set>
 #include <string>
 
 using namespace arm_compute;
-using namespace arm_compute::gles_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+{
+    ARM_COMPUTE_UNUSED(reshape_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
+
+    if(!is_interleaved_transposed)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+            ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+        }
+    }
+    else
+    {
+        const int m                         = reshape_info.m();
+        const int n                         = reshape_info.n();
+        const int k                         = reshape_info.k();
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        TensorShape tensor_shape0{ input0->tensor_shape() };
+        tensor_shape0.set(0, k);
+        tensor_shape0.set(1, m);
+
+        TensorShape tensor_shape1{ input1->tensor_shape() };
+        tensor_shape1.set(0, n);
+        tensor_shape1.set(1, k);
+
+        const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+        const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+        }
+    }
+
+    return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
+                                                               bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+                                                               GPUTarget gpu_target, ElementsProcessed &num_elements_processed)
+{
+    ARM_COMPUTE_UNUSED(gpu_target);
+
+    // Output tensor auto initialization if not yet initialized
+    TensorShape tensor_shape{ input0->tensor_shape() };
+    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0));
+    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1));
+
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape));
+
+    bool   window_changed = false;
+    Window win{};
+
+    const DataType data_type                           = input0->data_type();
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+    if(is_interleaved_transposed)
+    {
+        // Configure window kernel
+        num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
+        num_elems_processed_per_iteration_y = 4;
+
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    else // The input tensors have not been reshaped
+    {
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+        switch(data_type)
+        {
+            case DataType::F16:
+                num_elems_processed_per_iteration_x = 4;
+                break;
+
+            case DataType::F32:
+                num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowStatic    input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic    input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr)
 {
 }
 
-void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed)
+void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
-    if(!is_interleaved_transposed)
-    {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
-    }
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
     _input0 = input0;
     _input1 = input1;
     _output = output;
 
+    // Get target architecture
+    GPUTarget gpu_target = get_target();
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IGCKernel::configure(win_config.second);
+
+    // Create build options
     std::set<std::string> build_opts;
-    Window                win;
+    std::string           kernel_name;
 
     build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
     build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
@@ -74,6 +218,12 @@
     // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
     if(is_interleaved_transposed)
     {
+        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
+        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+        build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width));
+        build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height));
+
         switch(input0->info()->data_type())
         {
             case DataType::F16:
@@ -91,57 +241,32 @@
 
         build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
 
-        // Create kernel
-        _kernel = GCKernelLibrary::get().create_kernel(("gemm_mm_interleaved_transposed"), build_opts);
-
-        // Configure window kernel
-        const unsigned int     num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
-        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
-        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-        update_window_and_padding(win, input0_access, input1_access, output_access);
-
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+        kernel_name = "gemm_mm_interleaved_transposed";
     }
     else
     {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
-
         // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
-        unsigned int num_elems_processed_per_iteration_x;
-        unsigned int num_elems_processed_per_iteration_y;
 
+        GPUTarget arch_target = get_arch_from_target(gpu_target);
         switch(input0->info()->data_type())
         {
             case DataType::F16:
                 build_opts.emplace("#define DATA_TYPE_FP16");
-
-#define MM_PROCESS_4X_OPTIMIZED
-
-#if defined(MM_PROCESS_4X)
-                num_elems_processed_per_iteration_x = 4;
-                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-                build_opts.emplace("#define MM_PROCESS_4X");
-#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
-                num_elems_processed_per_iteration_x = 4;
-                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
                 build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
-#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
-                num_elems_processed_per_iteration_x = 8;
-                num_elems_processed_per_iteration_y = 1;
-                build_opts.emplace("#define MM_PROCESS_8X");
-#endif                                 /* MM_PROCESS_4X */
+                build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
                 break;
 
             case DataType::F32:
-                num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
-                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
                 build_opts.emplace("#define DATA_TYPE_FP32");
+
+                if(arch_target == GPUTarget::BIFROST && input0->info()->num_dimensions() != 1)
+                {
+                    build_opts.emplace("#define GEMM_MM_FLOATING_POINT_BIFROST");
+                }
+                else
+                {
+                    build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
+                }
                 break;
 
             default:
@@ -149,32 +274,31 @@
                 break;
         }
 
-        build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
-        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x));
-        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));
 
-        // Create kernel
-        _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts);
-
-        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-#if defined(MM_PROCESS_4X_OPTIMIZED)
-        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
-#else  /* MM_PROCESS_4X_OPTIMIZED */
-        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
-                                         num_elems_processed_per_iteration_y));
-#endif /* MM_PROCESS_4X_OPTIMIZED */
-        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
-        update_window_and_padding(win, input0_access, input1_access, output_access);
-
-        Coordinates coord;
-        coord.set_num_dimensions(output->info()->num_dimensions());
-        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+        kernel_name = "gemm_mm_floating_point";
     }
 
-    IGCKernel::configure(win);
+    // Create kernel
+    _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
+}
+
+Status GCGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
+                                            const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+                                                              input1->clone().get(),
+                                                              output->clone().get(),
+                                                              is_interleaved_transposed,
+                                                              reshape_info,
+                                                              gpu_target,
+                                                              num_elements_processed)
+                                .first);
+    return Status{};
 }
 
 void GCGEMMMatrixMultiplyKernel::run(const Window &window)
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 4ab6f3e..6c89616 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "support/ToolchainSupport.h"
@@ -64,7 +65,7 @@
 {
 }
 
-void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -97,7 +98,8 @@
                                      && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                                     input->info()->tensor_shape().cend(),
                                                     output->info()->tensor_shape().cbegin() + 1))
-                                     && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+                                     && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
+                                     && (dilation == Size2D(1U, 1U));
 
     std::string kernel_name = "im2col_generic";
     if(!run_img2col_reduced)
@@ -110,8 +112,8 @@
         build_opts.emplace("#define IM2COL_GENERIC");
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                             kernel_dims.width, kernel_dims.height,
-                                            conv_info);
-        _num_elems_processed_per_iteration = 2;
+                                            conv_info, dilation);
+        _num_elems_processed_per_iteration = (input->info()->data_type() == DataType::F32) ? 1 : 2;
 
         build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width));
         build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.height));
@@ -126,6 +128,8 @@
         build_opts.emplace("#define PAD_BOTTOM " + support::cpp11::to_string(conv_info.pad_bottom()));
         build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
         build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.emplace("#define DILATION_X " + support::cpp11::to_string(dilation.x()));
+        build_opts.emplace("#define DILATION_Y " + support::cpp11::to_string(dilation.y()));
 
         _run_func = &GCIm2ColKernel::run_generic;
     }
@@ -205,11 +209,12 @@
     IGCKernel::configure(win);
 }
 
-Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(kernel_dims);
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(has_bias);
+    ARM_COMPUTE_UNUSED(dilation);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
     return Status{};
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index 46d7ff9..f87615a 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -51,7 +51,6 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON(output == input);
     ARM_COMPUTE_ERROR_ON(policy != InterpolationPolicy::NEAREST_NEIGHBOR);
-    ARM_COMPUTE_UNUSED(sampling_policy);
 
     _input  = input;
     _output = output;
@@ -123,7 +122,7 @@
     output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
                                                                      output->info()->tensor_shape(),
                                                                      policy,
-                                                                     border,
+                                                                     sampling_policy,
                                                                      border_undefined));
 
     IGCKernel::configure(win);
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
index 21946b7..f0057df 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
@@ -36,7 +36,6 @@
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
-using namespace arm_compute::gles_compute;
 
 GCTensorShiftKernel::GCTensorShiftKernel()
     : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0)
diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
index 4c08873..ccbfaf8 100644
--- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
@@ -31,11 +31,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 
 using namespace arm_compute;
-using namespace arm_compute::gles_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 GCWeightsReshapeKernel::GCWeightsReshapeKernel()
     : _input(nullptr), _biases(nullptr), _output(nullptr)
@@ -47,15 +48,8 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    // Calculate output shape
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.collapse(3);
-    const size_t tmp_dim = output_shape[0];
-    output_shape.set(0, output_shape[1]);
-    output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
-
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr))));
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
new file mode 100644
index 0000000..575d858
--- /dev/null
+++ b/src/core/GPUTarget.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Log.h"
+
+#include <map>
+#include <regex>
+
+namespace
+{
+arm_compute::GPUTarget get_bifrost_target(const std::string &version)
+{
+    if(version == "G71")
+    {
+        return arm_compute::GPUTarget::G71;
+    }
+    else if(version == "G72")
+    {
+        return arm_compute::GPUTarget::G72;
+    }
+    else if(version == "G51")
+    {
+        return arm_compute::GPUTarget::G51;
+    }
+    else if(version == "G51BIG")
+    {
+        return arm_compute::GPUTarget::G51BIG;
+    }
+    else if(version == "G51LIT")
+    {
+        return arm_compute::GPUTarget::G51LIT;
+    }
+    else if(version == "TNOX")
+    {
+        return arm_compute::GPUTarget::TNOX;
+    }
+    else if(version == "TTRX")
+    {
+        return arm_compute::GPUTarget::TTRX;
+    }
+    else if(version == "TBOX")
+    {
+        return arm_compute::GPUTarget::TBOX;
+    }
+    else
+    {
+        return arm_compute::GPUTarget::BIFROST;
+    }
+}
+
+arm_compute::GPUTarget get_midgard_target(const std::string &version)
+{
+    if(version == "T600")
+    {
+        return arm_compute::GPUTarget::T600;
+    }
+    else if(version == "T700")
+    {
+        return arm_compute::GPUTarget::T700;
+    }
+    else if(version == "T800")
+    {
+        return arm_compute::GPUTarget::T800;
+    }
+    else
+    {
+        return arm_compute::GPUTarget::MIDGARD;
+    }
+}
+} // namespace
+
+namespace arm_compute
+{
+const std::string &string_from_target(GPUTarget target)
+{
+    static std::map<GPUTarget, const std::string> gpu_target_map =
+    {
+        { GPUTarget::MIDGARD, "midgard" },
+        { GPUTarget::BIFROST, "bifrost" },
+        { GPUTarget::T600, "t600" },
+        { GPUTarget::T700, "t700" },
+        { GPUTarget::T800, "t800" },
+        { GPUTarget::G71, "g71" },
+        { GPUTarget::G72, "g72" },
+        { GPUTarget::G51, "g51" },
+        { GPUTarget::G51BIG, "g51big" },
+        { GPUTarget::G51LIT, "g51lit" },
+        { GPUTarget::TNOX, "tnox" },
+        { GPUTarget::TTRX, "ttrx" },
+        { GPUTarget::TBOX, "tbox" }
+    };
+
+    return gpu_target_map[target];
+}
+
+GPUTarget get_target_from_name(const std::string &device_name)
+{
+    std::regex  mali_regex(R"(Mali-(.*))");
+    std::smatch name_parts;
+    const bool  found_mali = std::regex_search(device_name, name_parts, mali_regex);
+
+    if(!found_mali)
+    {
+        ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Mali GPU. Target is set to UNKNOWN.");
+        return GPUTarget::UNKNOWN;
+    }
+
+    const char         target  = name_parts.str(1)[0];
+    const std::string &version = name_parts.str(1);
+
+    std::regex future_regex(R"(.*X)");
+    const bool is_future_bifrost = std::regex_search(version, future_regex);
+
+    if(target == 'G' || is_future_bifrost)
+    {
+        return get_bifrost_target(version);
+    }
+    else if(target == 'T')
+    {
+        return get_midgard_target(version);
+    }
+    else
+    {
+        ARM_COMPUTE_LOG_INFO_MSG_CORE("Mali GPU unknown. Target is set to the default one. (BIFROST)");
+        return GPUTarget::BIFROST;
+    }
+}
+
+GPUTarget get_arch_from_target(GPUTarget target)
+{
+    return (target & GPUTarget::GPU_ARCH_MASK);
+}
+} // namespace arm_compute
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
index 73f4c42..4f99455 100644
--- a/src/core/HOGInfo.cpp
+++ b/src/core/HOGInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,7 @@
     _phase_type            = phase_type;
 
     // Compute descriptor size. +1 takes into account of the bias
-    _descriptor_size = num_cells_per_block().area() * num_blocks_per_image(_detection_window_size).area() * _num_bins + 1;
+    _descriptor_size = num_cells_per_block().area() * num_block_positions_per_image(_detection_window_size).area() * _num_bins + 1;
 }
 
 Size2D HOGInfo::num_cells_per_block() const
@@ -80,8 +80,10 @@
                   _block_stride.height / _cell_size.height);
 }
 
-Size2D HOGInfo::num_blocks_per_image(const Size2D &image_size) const
+Size2D HOGInfo::num_block_positions_per_image(const Size2D &image_size) const
 {
+    ARM_COMPUTE_ERROR_ON(_block_stride.width == 0 || _block_stride.height == 0);
+
     return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
                   ((image_size.height - _block_size.height) / _block_stride.height) + 1);
 }
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 3ee0fa7..e336331 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,3 +173,79 @@
 
     return window;
 }
+
+ValidRegion arm_compute::calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
+                                                      InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+{
+    const DataLayout data_layout = src_info.data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    const float scale_x        = static_cast<float>(dst_shape[idx_width]) / src_info.tensor_shape()[idx_width];
+    const float scale_y        = static_cast<float>(dst_shape[idx_height]) / src_info.tensor_shape()[idx_height];
+    const float sampling_point = (sampling_policy == SamplingPolicy::CENTER) ? 0.5f : 0.0f;
+
+    // Get input's valid region start and end points
+    const int valid_start_in_x = src_info.valid_region().anchor[idx_width];
+    const int valid_start_in_y = src_info.valid_region().anchor[idx_height];
+    const int valid_end_in_x   = src_info.valid_region().anchor[idx_width] + src_info.valid_region().shape[idx_width];
+    const int valid_end_in_y   = src_info.valid_region().anchor[idx_height] + src_info.valid_region().shape[idx_height];
+
+    // Initialize output's valid region start and end points
+    auto valid_start_out_x = static_cast<int>(valid_start_in_x * scale_x);
+    auto valid_start_out_y = static_cast<int>(valid_start_in_y * scale_y);
+    auto valid_end_out_x   = std::min<int>(std::ceil(valid_end_in_x * scale_x), dst_shape[idx_width]);
+    auto valid_end_out_y   = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]);
+
+    // Handle valid points in case of the bi-linear interpolation
+    if(border_undefined)
+    {
+        switch(interpolate_policy)
+        {
+            case InterpolationPolicy::NEAREST_NEIGHBOR:
+            {
+                // (start_out + sampling_point) >= (start_in * scale)
+                // start_out = ceil((start_in * scale) - sampling_point)
+                valid_start_out_x = std::ceil(valid_start_in_x * scale_x - sampling_point);
+                valid_start_out_y = std::ceil(valid_start_in_y * scale_y - sampling_point);
+
+                // (end_out - 1 + sampling_point) < (end_in * scale)
+                // end_out   = ceil((end_in * scale) - sampling_point); // <-- ceil(x - 1) strictly less
+                valid_end_out_x = std::ceil(valid_end_in_x * scale_x - sampling_point);
+                valid_end_out_y = std::ceil(valid_end_in_y * scale_y - sampling_point);
+                break;
+            }
+            case InterpolationPolicy::BILINEAR:
+            {
+                // (start_out + sampling_point) >= ((start_in + sampling_point) * scale)
+                // start_out = ceil(((start_in + sampling_point) * scale) - sampling_point)
+                valid_start_out_x = std::ceil((valid_start_in_x + sampling_point) * scale_x - sampling_point);
+                valid_start_out_y = std::ceil((valid_start_in_y + sampling_point) * scale_y - sampling_point);
+
+                // (end_out - 1 + sampling_point) <= ((end_in - 1 + sampling_point) * scale)
+                // end_out   = floor(((end_in - 1 + sampling_point) * scale) - sampling_point + 1)
+                valid_end_out_x = std::floor((valid_end_in_x - 1.f + sampling_point) * scale_x - sampling_point + 1.f);
+                valid_end_out_y = std::floor((valid_end_in_y - 1.f + sampling_point) * scale_y - sampling_point + 1.f);
+                break;
+            }
+            case InterpolationPolicy::AREA:
+                break;
+            default:
+            {
+                ARM_COMPUTE_ERROR("Invalid InterpolationPolicy");
+                break;
+            }
+        }
+    }
+
+    // Setup output valid region
+    ValidRegion valid_region{ Coordinates(), dst_shape, src_info.tensor_shape().num_dimensions() };
+
+    valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x));
+    valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y));
+
+    valid_region.shape.set(idx_width, std::min<size_t>(valid_end_out_x - valid_start_out_x, dst_shape[idx_width]));
+    valid_region.shape.set(idx_height, std::min<size_t>(valid_end_out_y - valid_start_out_y, dst_shape[idx_height]));
+
+    return valid_region;
+}
\ No newline at end of file
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index b65c4f4..eb5f072 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -151,3 +151,13 @@
         }
     }
 }
+
+bool ITensor::is_used() const
+{
+    return _is_used;
+}
+
+void ITensor::mark_as_unused() const
+{
+    _is_used = false;
+}
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 1f730a2..6be50fd 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -58,20 +58,39 @@
     if(nullptr != output)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+    if(beta != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+    }
+    if(gamma != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
 
     return Status{};
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
+    if(output != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
+    }
+
     unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
 
     Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -99,13 +118,13 @@
     const int  fixed_point_position = _input->info()->fixed_point_position();
     const auto input_mean           = reinterpret_cast<const qint8_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var            = reinterpret_cast<const qint8_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma          = reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta           = reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma          = (_gamma != nullptr) ? reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta           = (_beta != nullptr) ? reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     qint8x16_t       mean_vec    = vdupq_n_qs8(0);
     qint8x16_t       var_vec     = vdupq_n_qs8(0);
-    qint8x16_t       gamma_vec   = vdupq_n_qs8(0);
-    qint8x16_t       beta_vec    = vdupq_n_qs8(0);
+    qint8x16_t       gamma_vec   = vdupq_n_qs8(sqcvt_qs8_f32(1, fixed_point_position));
+    qint8x16_t       beta_vec    = vdupq_n_qs8(sqcvt_qs8_f32(0, fixed_point_position));
     qint8x16_t       denominator = vdupq_n_qs8(0);
     const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position));
     execute_window_loop(window, [&](const Coordinates & id)
@@ -113,10 +132,16 @@
         if(slice != id.z())
         {
             // Conctruct vectors
-            mean_vec  = vdupq_n_qs8(*(input_mean + id.z()));
-            var_vec   = vdupq_n_qs8(*(input_var + id.z()));
-            gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
-            beta_vec  = vdupq_n_qs8(*(input_beta + id.z()));
+            mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
+            var_vec  = vdupq_n_qs8(*(input_var + id.z()));
+            if(input_gamma != nullptr)
+            {
+                gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
+            }
+            if(input_beta != nullptr)
+            {
+                beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
+            }
 
             // Calculate denominator
             denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
@@ -146,13 +171,13 @@
     const int  fixed_point_position = _input->info()->fixed_point_position();
     const auto input_mean           = reinterpret_cast<const qint16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var            = reinterpret_cast<const qint16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma          = reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta           = reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma          = (_gamma != nullptr) ? reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta           = (_beta != nullptr) ? reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     qint16x8_t       mean_vec    = vdupq_n_qs16(0);
     qint16x8_t       var_vec     = vdupq_n_qs16(0);
-    qint16x8_t       gamma_vec   = vdupq_n_qs16(0);
-    qint16x8_t       beta_vec    = vdupq_n_qs16(0);
+    qint16x8_t       gamma_vec   = vdupq_n_qs16(sqcvt_qs16_f32(1, fixed_point_position));
+    qint16x8_t       beta_vec    = vdupq_n_qs16(sqcvt_qs16_f32(0, fixed_point_position));
     qint16x8_t       denominator = vdupq_n_qs16(0);
     const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position));
     execute_window_loop(window, [&](const Coordinates & id)
@@ -160,10 +185,16 @@
         if(slice != id.z())
         {
             // Conctruct vectors
-            mean_vec  = vdupq_n_qs16(*(input_mean + id.z()));
-            var_vec   = vdupq_n_qs16(*(input_var + id.z()));
-            gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
-            beta_vec  = vdupq_n_qs16(*(input_beta + id.z()));
+            mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
+            var_vec  = vdupq_n_qs16(*(input_var + id.z()));
+            if(input_gamma != nullptr)
+            {
+                gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
+            }
+            if(input_beta != nullptr)
+            {
+                beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
+            }
 
             // Calculate denominator
             denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
@@ -179,9 +210,9 @@
 }
 
 template <bool fused_activation>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16(const Window &window)
+void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
 {
-    static_assert(!fused_activation, "Activation is not supported for QS8");
+    static_assert(!fused_activation, "Activation is not supported for FP16");
 
     ARM_COMPUTE_UNUSED(window);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -194,12 +225,12 @@
 
     const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta  = reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     float16x8_t       mean_vec    = vdupq_n_f16(0.0);
     float16x8_t       var_vec     = vdupq_n_f16(0.0);
-    float16x8_t       gamma_vec   = vdupq_n_f16(0.0);
+    float16x8_t       gamma_vec   = vdupq_n_f16(1.0);
     float16x8_t       beta_vec    = vdupq_n_f16(0.0);
     float16x8_t       denominator = vdupq_n_f16(0.0);
     const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
@@ -208,10 +239,16 @@
         if(slice != id.z())
         {
             // Conctruct vectors
-            mean_vec  = vdupq_n_f16(*(input_mean + id.z()));
-            var_vec   = vdupq_n_f16(*(input_var + id.z()));
-            gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
-            beta_vec  = vdupq_n_f16(*(input_beta + id.z()));
+            mean_vec = vdupq_n_f16(*(input_mean + id.z()));
+            var_vec  = vdupq_n_f16(*(input_var + id.z()));
+            if(input_gamma != nullptr)
+            {
+                gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+            }
+            if(input_beta != nullptr)
+            {
+                beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+            }
 
             // Calculate denominator
             denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
@@ -227,8 +264,43 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
+template <bool fused_activation>
+void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
+{
+    static_assert(!fused_activation, "Activation is not supported for FP16");
+
+    ARM_COMPUTE_UNUSED(window);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+    const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Construct vectors
+        const float16x8_t mean_vec  = vld1q_f16(input_mean + id.x());
+        const float16x8_t var_vec   = vld1q_f16(input_var + id.x());
+        const float16x8_t gamma_vec = (input_gamma != nullptr) ? vld1q_f16(input_gamma + id.x()) : vdupq_n_f16(1.0);
+        const float16x8_t beta_vec  = (input_beta != nullptr) ? vld1q_f16(input_beta + id.x()) : vdupq_n_f16(0.0);
+        // Calculate denominator
+        const float16x8_t denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
+
+        // Calculate x bar and store results
+        const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
+        const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+    },
+    input, output);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+
 template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32(const Window &window)
+void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw(const Window &window)
 {
     Iterator input(_input, window);
     Iterator output(_output, window);
@@ -241,12 +313,12 @@
 
     const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0)));
-    const auto input_beta  = reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     float32x4_t       mean_vec    = vdupq_n_f32(0.0);
     float32x4_t       var_vec     = vdupq_n_f32(0.0);
-    float32x4_t       gamma_vec   = vdupq_n_f32(0.0);
+    float32x4_t       gamma_vec   = vdupq_n_f32(1.0);
     float32x4_t       beta_vec    = vdupq_n_f32(0.0);
     float32x4_t       denominator = vdupq_n_f32(0.0);
     const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
@@ -255,10 +327,16 @@
         if(slice != id.z())
         {
             // Conctruct vectors
-            mean_vec  = vdupq_n_f32(*(input_mean + id.z()));
-            var_vec   = vdupq_n_f32(*(input_var + id.z()));
-            gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
-            beta_vec  = vdupq_n_f32(*(input_beta + id.z()));
+            mean_vec = vdupq_n_f32(*(input_mean + id.z()));
+            var_vec  = vdupq_n_f32(*(input_var + id.z()));
+            if(input_gamma != nullptr)
+            {
+                gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
+            }
+            if(input_beta != nullptr)
+            {
+                beta_vec = vdupq_n_f32(*(input_beta + id.z()));
+            }
 
             // Calculate denominator
             denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
@@ -282,8 +360,50 @@
     input, output);
 }
 
+template <bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    F activation_functor(_act_info);
+
+    const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+
+    const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Construct vectors
+        const float32x4_t mean_vec  = vld1q_f32(input_mean + id.x());
+        const float32x4_t var_vec   = vld1q_f32(input_var + id.x());
+        const float32x4_t gamma_vec = (input_gamma != nullptr) ? vld1q_f32(input_gamma + id.x()) : vdupq_n_f32(1.0);
+        const float32x4_t beta_vec  = (input_beta != nullptr) ? vld1q_f32(input_beta + id.x()) : vdupq_n_f32(0.0);
+        // Calculate denominator
+        const float32x4_t denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
+
+        // Calculate x bar
+        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
+        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
+        float32x4_t       res       = vmlaq_f32(beta_vec, x_bar, gamma_vec);
+
+        // Perform fused activation
+        if(fused_activation)
+        {
+            activation_functor(res);
+        }
+
+        // Store results
+        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+    },
+    input, output);
+}
+
 void NEBatchNormalizationLayerKernel::configure_non_fused()
 {
+    const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
     switch(_input->info()->data_type())
     {
         case DataType::QS8:
@@ -293,10 +413,11 @@
             _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16<false>;
             break;
         case DataType::F16:
-            _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp16<false>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
             break;
         case DataType::F32:
-            _func = &NEBatchNormalizationLayerKernel::batch_normalization_fp32<false, ::detail::dummy<float, 4>>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
+                    &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
             break;
         default:
             ARM_COMPUTE_ERROR("Element size not supported");
@@ -306,18 +427,25 @@
 
 void NEBatchNormalizationLayerKernel::configure_fused()
 {
-    // Fused Batched Normalization with activation functions : FP32
-    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32 =
+    // NCHW Fused Batched Normalization with activation functions : FP32
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::relu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::brelu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32<true, ::detail::lubrelu<float, 4>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::lubrelu<float, 4>> }
+    };
+    // NHWC Fused Batched Normalization with activation functions : FP32
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
+    {
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
     };
 
     switch(_input->info()->data_type())
     {
         case DataType::F32:
-            _func = bn_fused_map_f32[_act_info.activation()];
+            _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f32_nhwc[_act_info.activation()] : bn_fused_map_f32_nchw[_act_info.activation()];
             break;
         default:
             ARM_COMPUTE_ERROR("Element size not supported");
@@ -335,21 +463,12 @@
                                                 const ITensor *beta, const ITensor *gamma,
                                                 float epsilon, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
 
-    ITensorInfo *output_info = nullptr;
-
-    if(nullptr != output)
-    {
-        // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), *input->info());
-
-        output_info = output->info();
-    }
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info,
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
                                                   mean->info(), var->info(),
-                                                  beta->info(), gamma->info(),
+                                                  (beta != nullptr) ? beta->info() : nullptr,
+                                                  (gamma != nullptr) ? gamma->info() : nullptr,
                                                   epsilon, act_info));
 
     _input    = input;
@@ -361,7 +480,8 @@
     _epsilon  = epsilon;
     _act_info = act_info;
 
-    if(output != nullptr)
+    const bool run_in_place = (output == nullptr) || (output == input);
+    if(!run_in_place)
     {
         _output = output;
     }
@@ -377,7 +497,7 @@
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output_info);
+    auto win_config = validate_and_configure_window(input->info(), (run_in_place) ? nullptr : output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index a2b24de..28fb4bd 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,47 +56,58 @@
     ARM_COMPUTE_ERROR_ON(plane1 == output);
     ARM_COMPUTE_ERROR_ON(plane2 == output);
 
-    set_format_if_unknown(*plane0->info(), Format::U8);
-    set_format_if_unknown(*plane1->info(), Format::U8);
-    set_format_if_unknown(*plane2->info(), Format::U8);
-
-    if(plane3 != nullptr)
-    {
-        set_format_if_unknown(*plane3->info(), Format::U8);
-    }
-
-    set_shape_if_empty(*output->info(), plane0->info()->tensor_shape());
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
 
-    if(plane3 != nullptr)
+    const Format output_format = output->info()->format();
+
+    // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
+    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
     {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3);
+        // Validate Y plane of input and output
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
+
+        // Validate U and V plane of the input
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
     }
 
-    const Format &output_format = output->info()->format();
+    _planes[0] = plane0;
+    _planes[1] = plane1;
+    _planes[2] = plane2;
+    _planes[3] = nullptr;
 
-    if(output_format == Format::RGBA8888)
+    // Validate the last input tensor only for RGBA format
+    if(Format::RGBA8888 == output_format)
     {
-        ARM_COMPUTE_ERROR_ON(plane3 == output);
+        ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
+        ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
+
+        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
+
+        _planes[3] = plane3;
     }
 
-    _planes[0]    = plane0;
-    _planes[1]    = plane1;
-    _planes[2]    = plane2;
-    _planes[3]    = plane3;
     _output       = output;
     _output_multi = nullptr;
 
+    // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
+    if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
+    {
+        _x_subsampling[1] = 2;
+        _x_subsampling[2] = 2;
+    }
+
     _num_elems_processed_per_iteration = 8;
     _is_parallelizable                 = true;
 
+    // Select function and number of elements to process given the output format
     switch(output_format)
     {
         case Format::RGB888:
@@ -106,14 +117,10 @@
             _func = &NEChannelCombineKernel::combine_4C;
             break;
         case Format::UYVY422:
-            _x_subsampling[1]                  = 2;
-            _x_subsampling[2]                  = 2;
             _num_elems_processed_per_iteration = 16;
             _func                              = &NEChannelCombineKernel::combine_YUV_1p<true>;
             break;
         case Format::YUYV422:
-            _x_subsampling[1]                  = 2;
-            _x_subsampling[2]                  = 2;
             _num_elems_processed_per_iteration = 16;
             _func                              = &NEChannelCombineKernel::combine_YUV_1p<false>;
             break;
@@ -122,14 +129,6 @@
             break;
     }
 
-    TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
-    subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
-    TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
-    subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
-
     Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
 
     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
@@ -167,65 +166,52 @@
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
 
-    set_format_if_unknown(*plane0->info(), Format::U8);
-    set_format_if_unknown(*plane1->info(), Format::U8);
-    set_format_if_unknown(*plane2->info(), Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
 
-    set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape());
-
-    switch(output->info()->format())
-    {
-        case Format::NV12:
-        case Format::NV21:
-        case Format::IYUV:
-        {
-            TensorShape subsampled_shape = plane0->info()->tensor_shape();
-            subsampled_shape.set(0, subsampled_shape[0] / 2);
-            subsampled_shape.set(1, subsampled_shape[1] / 2);
-
-            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape);
-
-            if(output->info()->format() == Format::IYUV)
-            {
-                set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
-
-                ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape);
-            }
-            break;
-        }
-        case Format::YUV444:
-            set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape());
-            set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape());
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported format");
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0));
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2);
 
-    _planes[0]                            = plane0;
-    _planes[1]                            = plane1;
-    _planes[2]                            = plane2;
-    _planes[3]                            = nullptr;
-    _output                               = nullptr;
-    _output_multi                         = output;
+    const Format output_format = output->info()->format();
+
+    // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
+    // Perform validation only for formats which require sub-sampling.
+    if(Format::YUV444 != output_format)
+    {
+        // Validate Y plane of input and output
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
+
+        // Validate U and V plane of the input
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
+
+        // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
+        // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
+        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
+
+        // Validate the last plane V of format IYUV
+        if(Format::IYUV == output_format)
+        {
+            // Validate V plane (third plane) of the output
+            ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
+        }
+    }
+
+    _planes[0]    = plane0;
+    _planes[1]    = plane1;
+    _planes[2]    = plane2;
+    _planes[3]    = nullptr;
+    _output       = nullptr;
+    _output_multi = output;
+
     bool         has_two_planes           = false;
     unsigned int num_elems_written_plane1 = 8;
 
     _num_elems_processed_per_iteration = 8;
     _is_parallelizable                 = true;
 
-    const Format &output_format = output->info()->format();
-
     switch(output_format)
     {
         case Format::NV12:
@@ -268,8 +254,7 @@
                               output_plane1_access,
                               output_plane2_access);
 
-    ValidRegion plane0_valid_region = plane0->info()->valid_region();
-
+    ValidRegion plane0_valid_region  = plane0->info()->valid_region();
     ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
 
     output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
@@ -358,7 +343,7 @@
 {
     // Create sub-sampled uv window and init uv planes
     Window win_uv(win);
-    win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]);
+    win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
     win_uv.validate();
 
     Iterator p0(_planes[0], win);
@@ -405,13 +390,13 @@
 
     // Update UV window
     Window uv_win(win);
-    uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration));
+    uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
     uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
     uv_win.validate();
 
     // Update output win
     Window out_win(win);
-    out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2));
+    out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
     out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
     out_win.validate();
 
@@ -421,6 +406,9 @@
     Iterator  p2(_planes[2 - shift], uv_win);
     Iterator  out(_output_multi->plane(1), out_win);
 
+    // Increase step size after iterator is created to calculate stride correctly for multi channel format
+    out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
+
     execute_window_loop(out_win, [&](const Coordinates & id)
     {
         const uint8x8x2_t pixels =
@@ -450,19 +438,17 @@
 
     // Update window
     Window tmp_win(win);
-    tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration));
+    tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
     tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
-    tmp_win.validate();
 
     Iterator in(_planes[plane_id], tmp_win);
     Iterator out(_output_multi->plane(plane_id), tmp_win);
 
     execute_window_loop(tmp_win, [&](const Coordinates & id)
     {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+        const uint8x8_t pixels = vld1_u8(in.ptr());
 
-        vst1_u8(out_ptr, vld1_u8(in_ptr));
+        vst1_u8(out.ptr(), pixels);
     },
     in, out);
 }
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000..b3746bd
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+using namespace arm_compute;
+
+NEConvertFullyConnectedWeightsKernel::NEConvertFullyConnectedWeightsKernel()
+    : _input(nullptr), _output(nullptr), _factor1(0), _factor2(0)
+{
+}
+
+void NEConvertFullyConnectedWeightsKernel::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
+                                                     DataLayout data_layout)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEConvertFullyConnectedWeightsKernel::validate(input->info(), output->info(), original_input_shape, data_layout));
+
+    _input  = input;
+    _output = output;
+
+    const unsigned int num_elems_per_input_plane = original_input_shape.x() * original_input_shape.y();
+    const unsigned int num_channels              = original_input_shape.z();
+
+    // Set conversion factors based on the weights' original data layout
+    if(data_layout == DataLayout::NCHW)
+    {
+        _factor1 = num_elems_per_input_plane;
+        _factor2 = num_channels;
+    }
+    else
+    {
+        _factor1 = num_channels;
+        _factor2 = num_elems_per_input_plane;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    INEKernel::configure(win);
+}
+
+Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+                                                      DataLayout data_layout)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+                                                         DataType::QS32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != original_input_shape.total_size_lower(3));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+    return Status{};
+}
+
+template <typename T>
+void NEConvertFullyConnectedWeightsKernel::run_convert_fc_weights(const Window &window)
+{
+    const unsigned int dst_stride_x = _output->info()->strides_in_bytes().x();
+    const unsigned int dst_stride_y = _output->info()->strides_in_bytes().y();
+
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        *reinterpret_cast<T *>(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y) = *reinterpret_cast<T *>(input.ptr());
+    },
+    input);
+}
+
+void NEConvertFullyConnectedWeightsKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    switch(_input->info()->element_size())
+    {
+        case 1:
+            run_convert_fc_weights<uint8_t>(window);
+            break;
+        case 2:
+            run_convert_fc_weights<uint16_t>(window);
+            break;
+        case 4:
+            run_convert_fc_weights<uint32_t>(window);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported.");
+            break;
+    }
+}
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index f5ee608..8cdf175 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -52,13 +52,14 @@
 {
 public:
     static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
     {
         const int input_offset   = -input->info()->quantization_info().offset;
         const int weights_offset = -weights->info()->quantization_info().offset;
 
         const int          input_stride_x  = input->info()->strides_in_bytes().x();
         const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          input_stride_z  = input->info()->strides_in_bytes().z();
         const int          output_stride_y = output->info()->strides_in_bytes().y();
         const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
         const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
@@ -93,7 +94,7 @@
             int ih = 0;
             int oh = 0;
 
-            const uint8_t *input_ptr        = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+            const uint8_t *input_ptr        = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y - (id.z() - id.z() / depth_multiplier) * input_stride_z;
             const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
 
             const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
@@ -125,19 +126,19 @@
 
 template <typename T1, typename T2>
 inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
-                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
     switch(conv_stride_x)
     {
         case 1:
-            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
             break;
         case 2:
-            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
             break;
         case 3:
-            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier);
             break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
@@ -146,7 +147,7 @@
 } // namespace
 
 NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false)
+    : _border_size(0), _input(), _output(), _weights(), _conv_info(), _convolver(nullptr), _num_elems_written_per_iteration(0), _run_optimized(false), _depth_multiplier(1)
 {
 }
 
@@ -155,20 +156,22 @@
     return _border_size;
 }
 
-void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, DataLayout data_layout)
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                                     DataLayout data_layout)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    _input     = input;
-    _output    = output;
-    _weights   = weights;
-    _conv_info = conv_info;
-    _convolver = nullptr;
+    _input            = input;
+    _output           = output;
+    _weights          = weights;
+    _conv_info        = conv_info;
+    _depth_multiplier = depth_multiplier;
+    _convolver        = nullptr;
 
     _run_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
                                                                                            conv_info,
-                                                                                           input->info()->data_type(),
+                                                                                           input->info()->data_type(), depth_multiplier,
                                                                                            data_layout);
 
     (_run_optimized) ? configure_optimized() : configure_generic();
@@ -182,7 +185,7 @@
     (_run_optimized) ? run_optimized(window, info) : run_generic(window, info);
 }
 
-bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, DataLayout data_layout)
+bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(TensorShape input_shape, PadStrideInfo conv_info, DataType dt, unsigned int depth_multiplier, DataLayout data_layout)
 {
     // Reshape input shape if in NHWC format
     TensorShape in_shape{ input_shape };
@@ -210,7 +213,7 @@
     bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
     bool          supported_padding = is_same_padding || is_valid_padding;
 
-    return supported_datatype && supported_strides && supported_padding;
+    return supported_datatype && supported_strides && supported_padding && (depth_multiplier == 1);
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
@@ -219,8 +222,7 @@
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
 
-    _convolver = create_convolver_object(_input->info()->tensor_shape(), _conv_info,
-                                         _weights->buffer(), _input->buffer(), _output->buffer());
+    _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
@@ -228,7 +230,7 @@
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(0) != 3 || _weights->info()->dimension(1) != 3);
 
     // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info);
+    const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info, _depth_multiplier);
     const DataType    output_dt    = (_input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : _input->info()->data_type();
 
     // Output auto inizialitation if not yet initialized
@@ -282,8 +284,7 @@
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
 
     _border_size = BorderSize(0, 0);
-    _convolver   = create_convolver_object(_input->info()->tensor_shape(), _conv_info,
-                                           _weights->buffer(), _input->buffer(), _output->buffer());
+    _convolver   = create_convolver_object(_conv_info, _weights, _input, _output);
 
     // Auto-configure output
     bool        same_padding = _conv_info.has_padding();
@@ -296,6 +297,15 @@
     auto_init_if_empty(*_output->info(),
                        _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
 
+    // Set padding in channels
+    const int num_channels = _weights->info()->dimension(0);
+    if((num_channels >= 128) && (num_channels % 16 == 0))
+    {
+        _input->info()->extend_padding(PaddingSize(0, 4, 0, 0));
+        _weights->info()->extend_padding(PaddingSize(0, 4, 0, 0));
+        _output->info()->extend_padding(PaddingSize(0, 4, 0, 0));
+    }
+
     // Configure window
     Window win;
     auto   win_last = _convolver->get_window();
@@ -310,10 +320,10 @@
     switch(_input->info()->data_type())
     {
         case DataType::F32:
-            convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
             break;
         case DataType::QASYMM8:
-            convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
             break;
         default:
             ARM_COMPUTE_ERROR("Not implemented");
@@ -330,41 +340,56 @@
     _convolver->run(start, end);
 }
 
-std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(TensorShape    shape,
-                                                                                                                PadStrideInfo  conv_info,
-                                                                                                                const uint8_t *w_ptr,
-                                                                                                                uint8_t       *in_ptr,
-                                                                                                                uint8_t       *out_ptr)
+std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3Kernel::create_convolver_object(PadStrideInfo  conv_info,
+                                                                                                                const ITensor *w,
+                                                                                                                const ITensor *in,
+                                                                                                                ITensor       *out,
+                                                                                                                bool           setup_strides)
 {
-    const int  in_rows      = shape.z();
-    const int  in_cols      = shape.y();
-    const int  n_batches    = shape[3];
-    const int  n_channels   = shape.x();
-    const bool padding_same = conv_info.has_padding();
+    const TensorShape shape               = in->info()->tensor_shape();
+    const int         in_rows             = shape.z();
+    const int         in_cols             = shape.y();
+    const int         n_batches           = shape[3];
+    const int         n_channels          = shape.x();
+    const bool        padding_same        = conv_info.has_padding();
+    const int         weight_col_stride   = (setup_strides) ? w->info()->strides_in_bytes().y() / w->info()->element_size() : 0;
+    const int         weight_row_stride   = (setup_strides) ? w->info()->strides_in_bytes().z() / w->info()->element_size() : 0;
+    const int         input_col_stride    = (setup_strides) ? in->info()->strides_in_bytes().y() / in->info()->element_size() : 0;
+    const int         input_row_stride    = (setup_strides) ? in->info()->strides_in_bytes().z() / in->info()->element_size() : 0;
+    const int         input_batch_stride  = (setup_strides) ? in->info()->strides_in_bytes()[3] / in->info()->element_size() : 0;
+    const int         output_col_stride   = (setup_strides) ? out->info()->strides_in_bytes().y() / out->info()->element_size() : 0;
+    const int         output_row_stride   = (setup_strides) ? out->info()->strides_in_bytes().z() / out->info()->element_size() : 0;
+    const int         output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
 
     const auto stride_x = conv_info.stride().first;
     switch(stride_x)
     {
         case 1:
-            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>>(
+            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
                        n_batches,
                        in_rows,
                        in_cols,
                        n_channels,
                        padding_same,
-                       reinterpret_cast<const float *>(w_ptr),
-                       reinterpret_cast<float *>(in_ptr),
-                       reinterpret_cast<float *>(out_ptr));
+                       reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+                       reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+                       reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
+                       weight_col_stride, weight_row_stride,
+                       input_col_stride, input_row_stride, input_batch_stride,
+                       output_col_stride, output_row_stride, output_batch_stride);
         case 2:
-            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>>(
+            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
                        n_batches,
                        in_rows,
                        in_cols,
                        n_channels,
                        padding_same,
-                       reinterpret_cast<const float *>(w_ptr),
-                       reinterpret_cast<float *>(in_ptr),
-                       reinterpret_cast<float *>(out_ptr));
+                       reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+                       reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+                       reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
+                       weight_col_stride, weight_row_stride,
+                       input_col_stride, input_row_stride, input_batch_stride,
+                       output_col_stride, output_row_stride, output_batch_stride);
         default:
             return nullptr;
     }
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index b924d9f..cfd8eac 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -85,7 +85,7 @@
         const int src_y = -pad_top + src_pixel_linear / max_initial_x * stride_y;
 
         // Get pointers
-        const uint8_t *const input_ptr  = in.ptr() + id.z() * input_stride_z;
+        const uint8_t *const input_ptr  = in.ptr() + id.z() / _depth_multiplier * input_stride_z;
         auto                 output_ptr = reinterpret_cast<T *>(out.ptr());
         const int            height     = src_y + _kernel_dims.height;
         const int            width      = src_x + _kernel_dims.width;
@@ -114,24 +114,25 @@
 }
 
 NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias(), _depth_multiplier(1)
 {
 }
 
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
 
-    _input       = input;
-    _output      = output;
-    _kernel_dims = kernel_dims;
-    _conv_info   = conv_info;
-    _has_bias    = has_bias;
+    _input            = input;
+    _output           = output;
+    _kernel_dims      = kernel_dims;
+    _conv_info        = conv_info;
+    _has_bias         = has_bias;
+    _depth_multiplier = depth_multiplier;
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index be211b2..4120e5f 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+    // Configure window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+    // Update window and padding
+    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 NEDequantizationLayerKernel::NEDequantizationLayerKernel()
     : _input(nullptr), _output(nullptr), _min_max(nullptr)
 {
@@ -41,34 +81,27 @@
 
 void NEDequantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(min_max);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::F32, 0);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
 
     _input   = input;
     _output  = output;
     _min_max = min_max;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
 
-    // Configure window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    // Update window and padding
-    update_window_and_padding(win, input_access, output_access, min_max_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+    INEKernel::configure(std::get<1>(win_config));
+}
 
-    INEKernel::configure(win);
+Status NEDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+    return Status{};
 }
 
 void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 285ec2d..5eafdf0 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <algorithm>
 #include <arm_neon.h>
@@ -663,6 +664,118 @@
     vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
 }
 
+template <typename T1>
+class convolver_nhwc
+{
+public:
+    static void convolve(const Window &window, int kernel_size, unsigned int num_elems_read_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        const int          input_width     = input->info()->dimension(0);
+        const int          input_depth     = input->info()->dimension(2);
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
+        const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          input_stride_z  = input->info()->strides_in_bytes().z();
+        const int          output_stride_x = output->info()->strides_in_bytes().x();
+        const int          kernel_stride_x = weights->info()->strides_in_bytes().x();
+        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
+        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
+        const int          conv_pad_top    = conv_info.pad_top();
+        const unsigned int conv_stride_x   = std::get<0>(conv_info.stride());
+        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const T1           zero            = 0;
+
+        // Setup input window for the input iterator
+        Window window_in = window;
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        // Setup input window for the output iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        // Setup input window for the weights iterator
+        Window window_k = calculate_max_window(*weights->info(), Steps());
+        window_k.set(Window::DimX, Window::Dimension(0, 1, 1));
+        window_k.set(Window::DimY, Window::Dimension(0, 1, 1));
+        window_k.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        window_k.set(3, Window::Dimension(0, weights->info()->dimension(3), 1));
+
+        Iterator in(input, window_in);
+        Iterator out(output, window_out);
+        Iterator k(weights, window_k);
+
+        execute_window_loop(window_k, [&](const Coordinates & id_k)
+        {
+            execute_window_loop(window_out, [&](const Coordinates & id)
+            {
+                const auto in_y = static_cast<int>(id.y() * conv_stride_x - conv_info.pad_left());
+                const auto in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top);
+
+                const uint8_t *in_ptr  = in.ptr() + in_y * input_stride_y + in_z * input_stride_z;
+                uint8_t       *out_ptr = out.ptr() + id_k[3] * output_stride_x;
+
+                T1 out_val = 0;
+
+                auto in_addr_base0 = in_ptr;
+                auto we_addr_base0 = k.ptr();
+
+                for(int z = 0; z < kernel_size; ++z, in_addr_base0 += input_stride_z, we_addr_base0 += kernel_stride_z)
+                {
+                    const int in_z = id.z() * conv_stride_y + z - conv_pad_top;
+
+                    if(in_z >= 0 && in_z < input_depth) // If false, pad top/bottom
+                    {
+                        auto in_addr_base1 = in_addr_base0;
+                        auto we_addr_base1 = we_addr_base0;
+
+                        for(int y = 0; y < kernel_size; ++y, in_addr_base1 += input_stride_y, we_addr_base1 += kernel_stride_y)
+                        {
+                            auto out_values = internal_vdupq_n(zero);
+
+                            int x           = 0;
+                            int no_leftover = input_width - num_elems_read_per_iteration;
+
+                            for(; x < no_leftover; x += num_elems_read_per_iteration)
+                            {
+                                const auto in_addr   = reinterpret_cast<const T1 *>(in_addr_base1 + x * input_stride_x);
+                                const auto in_values = internal_vld1q<1>(in_addr);
+
+                                const auto we_addr   = reinterpret_cast<const T1 *>(we_addr_base1 + x * kernel_stride_x);
+                                const auto we_values = internal_vld1q<1>(we_addr);
+
+                                out_values = internal_vmlal(out_values, in_values, we_values, 0);
+                            }
+
+                            out_val += out_values[0];
+                            out_val += out_values[1];
+                            out_val += out_values[2];
+                            out_val += out_values[3];
+
+                            // Leftover
+                            for(; x < input_width; ++x)
+                            {
+                                const auto in_addr  = reinterpret_cast<const T1 *>(in_addr_base1 + x * input_stride_x);
+                                const auto in_value = *(in_addr);
+
+                                const auto we_addr  = reinterpret_cast<const T1 *>(we_addr_base1 + x * kernel_stride_x);
+                                const auto we_value = *(we_addr);
+
+                                out_val += in_value * we_value;
+                            }
+                        }
+                    }
+                }
+
+                *(reinterpret_cast<T1 *>(out_ptr)) = out_val;
+            },
+            in, out);
+        },
+        k);
+    }
+};
+
 template <typename T1, typename T2, unsigned int stridex>
 class convolver_3x3
 {
@@ -1003,35 +1116,28 @@
     }
 }
 
-inline TensorShape get_convolved_dimensions(const ITensorInfo *input, const ITensorInfo *weights, const int kernel_size, const PadStrideInfo &conv_info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_size, kernel_size, conv_info);
-
-    TensorShape output_shape = input->tensor_shape();
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-    output_shape.set(2, weights->dimension(3));
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
+    const DataLayout data_layout = input->data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != input->dimension(channel_idx));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32);
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
-        TensorShape output_shape = get_convolved_dimensions(input, weights, weights->dimension(0), conv_info);
+        TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
         DataType data_type = input->data_type();
         if(is_data_type_fixed_point(data_type))
@@ -1050,101 +1156,127 @@
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
                                                         unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
 {
+    ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+    const DataLayout data_layout = input->data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+
     // Calculate right and bottom border
-    unsigned int kernel_size   = weights->dimension(0);
+    unsigned int kernel_size   = weights->dimension(width_idx);
     const int    conv_stride_x = std::get<0>(conv_info.stride());
     const int    conv_stride_y = std::get<1>(conv_info.stride());
-    const int    input_width   = input->dimension(0);
+    const int    input_width   = input->dimension(width_idx);
 
-    switch(kernel_size)
+    Window win{};
+    bool   window_changed = false;
+
+    if(data_layout == DataLayout::NCHW)
     {
-        case 1:
+        switch(kernel_size)
         {
-            switch(input->data_type())
+            case 1:
             {
+                switch(input->data_type())
+                {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
+                    case DataType::F16:
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                case DataType::QS8:
-                case DataType::QS16:
-                    num_elems_written_per_iteration = 8;
-                    break;
-                case DataType::F32:
-                    if(run_optim_small_tensor_info(input))
-                    {
+                    case DataType::QS8:
+                    case DataType::QS16:
                         num_elems_written_per_iteration = 8;
-                    }
-                    else
-                    {
-                        num_elems_written_per_iteration = 4;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported.");
-                    break;
+                        break;
+                    case DataType::F32:
+                        if(run_optim_small_tensor_info(input))
+                        {
+                            num_elems_written_per_iteration = 8;
+                        }
+                        else
+                        {
+                            num_elems_written_per_iteration = 4;
+                        }
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Data type not supported.");
+                        break;
+                }
+                num_weight_elems_read_per_row = kernel_size;
+                num_elems_read_per_iteration  = conv_stride_x * num_elems_written_per_iteration;
+                break;
             }
-            num_weight_elems_read_per_row = kernel_size;
-            num_elems_read_per_iteration  = conv_stride_x * num_elems_written_per_iteration;
-            break;
-        }
-        case 3:
-        case 5:
-        {
-            switch(input->data_type())
+            case 3:
+            case 5:
             {
-                case DataType::F32:
-                    num_weight_elems_read_per_row   = 4 + kernel_size - 1;
-                    num_elems_read_per_iteration    = 12;
-                    num_elems_written_per_iteration = 16 >> conv_stride_x;
-                    break;
+                switch(input->data_type())
+                {
+                    case DataType::F32:
+                        num_weight_elems_read_per_row   = 4 + kernel_size - 1;
+                        num_elems_read_per_iteration    = 12;
+                        num_elems_written_per_iteration = 16 >> conv_stride_x;
+                        break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
+                    case DataType::F16:
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                case DataType::QS8:
-                case DataType::QS16:
-                    num_weight_elems_read_per_row   = 8 + kernel_size - 1;
-                    num_elems_read_per_iteration    = 24;
-                    num_elems_written_per_iteration = 32 >> conv_stride_x;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported.");
-                    break;
+                    case DataType::QS8:
+                    case DataType::QS16:
+                        num_weight_elems_read_per_row   = 8 + kernel_size - 1;
+                        num_elems_read_per_iteration    = 24;
+                        num_elems_written_per_iteration = 32 >> conv_stride_x;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Data type not supported.");
+                        break;
+                }
+            }
+            break;
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not implemented");
+                break;
             }
         }
-        break;
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not implemented");
-            break;
-        }
+
+        // Calculate right pad
+        int start_x       = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
+        int end_x         = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
+        int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
+
+        // Calculate border
+        const unsigned int conv_pad_left   = conv_info.pad_left();
+        const unsigned int conv_pad_top    = conv_info.pad_top();
+        const unsigned int conv_pad_right  = std::max(upper_bound_w, 0);
+        const unsigned int conv_pad_bottom = conv_info.pad_bottom();
+
+        border_size.left   = conv_pad_left;
+        border_size.top    = conv_pad_top;
+        border_size.right  = conv_pad_right;
+        border_size.bottom = conv_pad_bottom;
+
+        // Configure window
+        win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+        AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
+                                           num_elems_read_per_iteration, kernel_size,
+                                           conv_stride_x, conv_stride_y);
+        AccessWindowStatic     weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
+        AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
+    else
+    {
+        border_size.left   = 0;
+        border_size.top    = conv_info.pad_left();
+        border_size.right  = 0;
+        border_size.bottom = conv_info.pad_right();
 
-    // Calculate right pad
-    int start_x       = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
-    int end_x         = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
-    int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
+        num_elems_read_per_iteration = 16 / element_size_from_data_type(input->data_type());
 
-    // Calculate border
-    const unsigned int conv_pad_left   = conv_info.pad_left();
-    const unsigned int conv_pad_top    = conv_info.pad_top();
-    const unsigned int conv_pad_right  = std::max(upper_bound_w, 0);
-    const unsigned int conv_pad_bottom = conv_info.pad_bottom();
+        win = calculate_max_window(*output, Steps());
 
-    border_size.left   = conv_pad_left;
-    border_size.top    = conv_pad_top;
-    border_size.right  = conv_pad_right;
-    border_size.bottom = conv_pad_bottom;
-
-    // Configure window
-    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
-
-    AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
-                                       num_elems_read_per_iteration, kernel_size,
-                                       conv_stride_x, conv_stride_y);
-    AccessWindowStatic     weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
-    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
-    bool                   window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        AccessWindowRectangle input_access(input, 0, -border_size.top, num_elems_read_per_iteration, kernel_size, 1.f, conv_stride_x);
+        AccessWindowRectangle weights_access(weights, 0, 0, num_elems_read_per_iteration, kernel_size);
+        window_changed = update_window_and_padding(win, input_access, weights_access);
+    }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
@@ -1170,7 +1302,7 @@
     _weights     = weights;
     _output      = output;
     _conv_info   = conv_info;
-    _kernel_size = weights->info()->dimension(0);
+    _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::WIDTH));
 
     const unsigned int conv_pad_left   = conv_info.pad_left();
     const unsigned int conv_pad_top    = conv_info.pad_top();
@@ -1179,7 +1311,7 @@
     _border_size                       = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
 
     // Get convolved dimensions
-    TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info);
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
 
     DataType data_type = input->info()->data_type();
 
@@ -1229,73 +1361,88 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
 
-    const int kernel_size = _weights->info()->dimension(0);
+    const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
 
-    switch(kernel_size)
+    if(_input->info()->data_layout() == DataLayout::NCHW)
     {
-        case 1:
+        switch(kernel_size)
         {
-            switch(_input->info()->data_type())
+            case 1:
             {
-                case DataType::QS8:
-                    convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
-                case DataType::QS16:
-                    convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
-                case DataType::F32:
-                    convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
+                switch(_input->info()->data_type())
+                {
+                    case DataType::QS8:
+                        convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
+                    case DataType::QS16:
+                        convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
+                    case DataType::F32:
+                        convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
+                    case DataType::F16:
+                        convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported");
-                    break;
+                    default:
+                        ARM_COMPUTE_ERROR("Data type not supported");
+                        break;
+                }
+                break;
             }
-            break;
-        }
-        case 3:
-        {
-            switch(_input->info()->data_type())
+            case 3:
             {
-                case DataType::QS8:
-                    convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
-                case DataType::F32:
-                    convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
+                switch(_input->info()->data_type())
+                {
+                    case DataType::QS8:
+                        convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
+                    case DataType::F32:
+                        convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
+                    case DataType::F16:
+                        convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported");
-                    break;
+                    default:
+                        ARM_COMPUTE_ERROR("Data type not supported");
+                        break;
+                }
+                break;
             }
-            break;
-        }
-        case 5:
-        {
-            switch(_input->info()->data_type())
+            case 5:
             {
-                case DataType::F32:
-                    convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported");
-                    break;
+                switch(_input->info()->data_type())
+                {
+                    case DataType::F32:
+                        convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Data type not supported");
+                        break;
+                }
+                break;
             }
-            break;
-        }
 
-        default:
+            default:
+            {
+                ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
+                break;
+            }
+        }
+    }
+    else
+    {
+        switch(_input->info()->data_type())
         {
-            ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
-            break;
+            case DataType::F32:
+                convolver_nhwc<float>::convolve(window, kernel_size, _num_elems_read_per_iteration, _input, _weights, _output, _conv_info);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data type not supported");
+                break;
         }
     }
 }
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 08d8f8c..edda2cd 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -44,6 +44,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
                                                          DataType::QS16, DataType::F16,
                                                          DataType::QS32, DataType::S32, DataType::F32);
@@ -68,6 +69,7 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
         }
 
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)));
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
     }
     else
@@ -79,6 +81,8 @@
     if((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
         if(is_data_type_fixed_point(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
@@ -101,6 +105,8 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
+    ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
     bool         window_changed                    = false;
     unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
 
@@ -138,8 +144,16 @@
         }
         else
         {
-            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
-            window_changed = update_window_and_padding(win, input_access, bias_access);
+            if(input->data_layout() == DataLayout::NCHW)
+            {
+                AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+                window_changed = update_window_and_padding(win, input_access, bias_access);
+            }
+            else
+            {
+                AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
+                window_changed = update_window_and_padding(win, input_access, bias_access);
+            }
         }
 
         input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
@@ -253,6 +267,7 @@
 void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
+    ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
     ARM_COMPUTE_UNUSED(result_shift);
     ARM_COMPUTE_UNUSED(result_offset_after_shift);
@@ -303,6 +318,66 @@
     }
 }
 
+template <typename T1, typename T2, bool in_place, bool has_bias>
+void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+    ARM_COMPUTE_UNUSED(result_shift);
+    ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+    Window window_bias = window;
+    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    window_bias.set(3, Window::Dimension(0, 0, 0));
+
+    Iterator in(input, window);
+    Iterator bi(bias, window_bias);
+
+    if(in_place) // In place accumulate
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Get bias and pointer to input
+            const auto in_ptr   = reinterpret_cast<T1 *>(in.ptr());
+            const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
+
+            // Accumulate bias
+            if(has_bias)
+            {
+                internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
+            }
+            else
+            {
+                internal_vst1q(in_ptr, internal_vld1q(in_ptr));
+            }
+        },
+        in, bi);
+    }
+    else // Out of place accumulate
+    {
+        Iterator out(output, window);
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Get bias and pointer to input
+            const auto in_ptr   = reinterpret_cast<T1 *>(in.ptr());
+            const auto out_ptr  = reinterpret_cast<T2 *>(out.ptr());
+            const auto bias_ptr = reinterpret_cast<T2 *>(bi.ptr());
+
+            // Accumulate bias
+            if(has_bias)
+            {
+                internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), internal_vld1q(bias_ptr)));
+            }
+            else
+            {
+                internal_vst1q(out_ptr, internal_vld1q(in_ptr));
+            }
+        },
+        in, bi);
+    }
+}
+
 // QASYMM8 specializations
 template <>
 void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
@@ -415,61 +490,79 @@
     INEKernel::configure(win_config.second);
 
     // Set appropriate function
-    switch(input->info()->data_type())
+    if(input->info()->data_layout() == DataLayout::NCHW)
     {
-        case DataType::QS8:
+        switch(input->info()->data_type())
         {
-            if(bias == nullptr)
+            case DataType::QS8:
             {
-                _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+                if(bias == nullptr)
+                {
+                    _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+                }
+                else
+                {
+                    _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+                }
+                break;
             }
-            else
+            case DataType::QS16:
             {
-                _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+                if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+                {
+                    _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+                }
+                else if(bias == nullptr)
+                {
+                    _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Not implemented");
+                }
+                break;
             }
-            break;
-        }
-        case DataType::QS16:
-        {
-            if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+            case DataType::QS32:
             {
-                _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+                _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
+                break;
             }
-            else if(bias == nullptr)
+            case DataType::S32:
             {
-                _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+                _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+                break;
             }
-            else
-            {
-                ARM_COMPUTE_ERROR("Not implemented");
-            }
-            break;
-        }
-        case DataType::QS32:
-        {
-            _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
-            break;
-        }
-        case DataType::S32:
-        {
-            _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
-            break;
-        }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
-            break;
-        }
+            case DataType::F16:
+            {
+                _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+                break;
+            }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
-        {
-            _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
-            break;
+            case DataType::F32:
+            {
+                _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+            }
         }
-        default:
+    }
+    else
+    {
+        switch(input->info()->data_type())
         {
-            ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+            case DataType::F32:
+            {
+                _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+            }
         }
     }
 }
diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
deleted file mode 100644
index 768dd8b..0000000
--- a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height)
-{
-    TensorShape output_shape      = input->tensor_shape();
-    const float interleave_by_f32 = block_height;
-    output_shape.set(0, input->dimension(0) * interleave_by_f32);
-    output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32));
-    return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height)
-{
-    const unsigned int num_elems_processed_per_iteration_x = block_width;
-    const unsigned int num_elems_processed_per_iteration_y = block_height;
-    bool               window_changed                      = false;
-
-    // Configure kernel window
-    Window      win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    const float scaley_factor = 1.f / block_height;
-
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    window_changed = window_changed || update_window_and_padding(win, input_access);
-
-    // Configure window in case of configured output
-    if(output->total_size() != 0)
-    {
-        AccessWindowRectangle output_access(output,
-                                            0, 0,
-                                            num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y,
-                                            1, num_elems_processed_per_iteration_y, scaley_factor);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    const unsigned int in_height = input->info()->dimension(1);
-    const unsigned int in_width  = input->info()->dimension(0);
-
-    const float scale_y_factor = 1.f / float(block_height);
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, scale_y_factor);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, block_width * block_height);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        std::fill_n(out.ptr(), block_width * block_height, 0);
-    },
-    out);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        for(unsigned int z = id.y(); (z < in_width) && z < (id.y() + block_height); ++z)
-        {
-            int j = (z - id.y()) * block_width;
-            for(unsigned int b = id.x(); (b < in_height) && (b < (id.x() + block_width)); ++b)
-            {
-                *(out.ptr() + j++) = *(input->buffer() + b * in_stride + z);
-            }
-        }
-    },
-    in, out);
-}
-
-inline void gemm_interleave_blocked_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    const unsigned int in_height = input->info()->dimension(1);
-    const unsigned int in_width  = input->info()->dimension(0);
-
-    const float scale_y_factor = 1.f / float(block_height);
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, scale_y_factor);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, block_width * block_height);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        std::fill_n(out.ptr(), block_width * block_height, 0);
-    },
-    out);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        for(unsigned int z = id.y(); (z < in_height) && z < (id.y() + block_height); ++z)
-        {
-            int j = (z - id.y()) * block_width;
-            for(unsigned int b = id.x(); (b < in_width) && (b < (id.x() + block_width)); ++b)
-            {
-                *(out.ptr() + j++) = *(input->buffer() + z * in_stride + b);
-            }
-        }
-    },
-    in, out);
-}
-} // namespace
-
-NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
-    : _block_height(0), _block_width(0), _transpose(false)
-{
-}
-
-void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height));
-
-    _input        = input;
-    _output       = output;
-    _block_height = block_height;
-    _block_width  = block_width;
-    _transpose    = transpose;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose)
-{
-    ARM_COMPUTE_UNUSED(transpose);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first);
-
-    return Status{};
-}
-
-void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    if(_transpose)
-    {
-        gemm_interleave_blocked_transposed_8bit(_input, _output, window, _block_width, _block_height);
-    }
-    else
-    {
-        gemm_interleave_blocked_8bit(_input, _output, window, _block_width, _block_height);
-    }
-}
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index 3fd81be..c204395 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -764,7 +764,7 @@
                               AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                               output_access);
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
@@ -786,7 +786,7 @@
 
     Window win_in(window);
     win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
-    win_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
 
     Iterator in(_input, win_in);
     Iterator out(_output, window);
@@ -794,7 +794,7 @@
     // Normalises blocks
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr());
+        const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr() + id.y() * _num_cells_per_block_stride.height * _input->info()->strides_in_bytes()[Window::DimY]);
         const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());
 
         // Execute normalization function
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index 343b051..2c02ab8 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,8 +81,8 @@
 
     // Configure kernel window
     Window win;
-    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
-    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
+    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
 
     constexpr unsigned int num_elems_read_per_iteration = 1;
     const unsigned int     num_rows_read_per_iteration  = _num_blocks_per_descriptor_y;
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 4fa329b..86e3fd7 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -45,34 +45,34 @@
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                          bool has_bias, bool is_fully_connected, bool is_flatten)
+                          bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
+    TensorShape expected_output_shape;
     if(is_flatten) /* Called by FlattenLayer */
     {
-        size_t flatten_shape = input->tensor_shape().x() * input->tensor_shape().y() * input->tensor_shape().z();
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != flatten_shape);
+        expected_output_shape = misc::shape_calculator::compute_im2col_flatten_shape(input);
     }
     else if(!is_fully_connected) /* Called by ConvolutionLayer */
     {
-        std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_dims.width, kernel_dims.height, conv_info);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(2) * kernel_dims.area() + (has_bias ? 1 : 0)));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != (out_dims.first * out_dims.second));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(2) != 1);
+        expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation);
     }
     else /* Called by FullyConnectedLayer */
     {
         const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
         const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
 
-        TensorInfo expected_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_im2col_shape(input, num_input_dimensions));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+        expected_output_shape = misc::shape_calculator::compute_im2col_fc_shape(input, num_input_dimensions);
     }
 
+    TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+
     return Status{};
 }
 
@@ -91,11 +91,13 @@
                              int                  input_stride_y,
                              int                  input_stride_z,
                              int                  fixed_point_position,
-                             int                  pad_value)
+                             int                  pad_value,
+                             int                  dilation_x,
+                             int                  dilation_y)
 {
     const int kernel_size2 = kernel_width * kernel_height;
-    const int x_e          = top_left_x + kernel_width;
-    const int y_e          = top_left_y + kernel_height;
+    const int x_e          = top_left_x + kernel_width * dilation_x;
+    const int y_e          = top_left_y + kernel_height * dilation_y;
 
     // Linearize volume
     int d = 0;
@@ -104,12 +106,12 @@
     // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
     for(; d <= (kernel_depth - 3); d += 3)
     {
-        for(int y = top_left_y; y < y_e; ++y)
+        for(int y = top_left_y; y < y_e; y += dilation_y)
         {
             if((y < 0 || y >= input_h) && has_pads)
             {
                 // All the values will be the offset (will be zeros when not quantized)
-                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     *(out_ptr + 0 * kernel_size2) = pad_value;
                     *(out_ptr + 1 * kernel_size2) = pad_value;
@@ -118,7 +120,7 @@
             }
             else
             {
-                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     if((x < 0 || x >= input_w) && has_pads)
                     {
@@ -141,7 +143,7 @@
     // Left over
     for(; d < kernel_depth; d++)
     {
-        for(int y = top_left_y; y < y_e; ++y)
+        for(int y = top_left_y; y < y_e; y += dilation_y)
         {
             if((y < 0 || y >= input_h) && has_pads)
             {
@@ -151,7 +153,7 @@
             }
             else
             {
-                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     if((x < 0 || x >= input_w) && has_pads)
                     {
@@ -191,12 +193,17 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const int kernel_depth   = _input->info()->dimension(2);
-    const int input_w        = _input->info()->dimension(0);
-    const int input_h        = _input->info()->dimension(1);
-    const int input_stride_x = _input->info()->strides_in_bytes().x();
-    const int input_stride_y = _input->info()->strides_in_bytes().y();
-    const int input_stride_z = _input->info()->strides_in_bytes().z();
+    const DataLayout   data_layout = _input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const int kernel_depth   = _input->info()->dimension(channel_idx);
+    const int input_w        = _input->info()->dimension(width_idx);
+    const int input_h        = _input->info()->dimension(height_idx);
+    const int input_stride_x = _input->info()->strides_in_bytes()[width_idx];
+    const int input_stride_y = _input->info()->strides_in_bytes()[height_idx];
+    const int input_stride_z = _input->info()->strides_in_bytes()[channel_idx];
     const int offset         = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
 
     int pad_left = 0;
@@ -211,30 +218,24 @@
     const int start_x = -pad_left;
     const int start_y = -pad_top;
 
-    Window window_in(window);
-    // The first three dimensions of the input are increased by the inner loops
-    window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-    window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    // Setup output window
-    Window window_out(window);
-    window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->strides_in_bytes().y() / _output->info()->element_size()));
-    window_out.set(Window::DimY, Window::Dimension(window.y().start() * _convolved_dims.first, window.y().end() * _convolved_dims.first, _convolved_dims.first));
-    window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+    Window window_in_out(window);
+    // The first three dimensions of the input and output are increased by the inner loops
+    window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Create iterators
-    Iterator in(_input, window_in);
-    Iterator out(_output, window_out);
+    Iterator in(_input, window_in_out);
+    Iterator out(_output, window_in_out);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const int top_left_x = id.x() * stride_x + start_x;
-        const int top_left_y = id.y() * stride_y + start_y;
+        const int top_left_x = id[width_idx] * stride_x + start_x;
+        const int top_left_y = id[height_idx] * stride_y + start_y;
 
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr();
-        auto                 output_ptr = reinterpret_cast<T *>(out.ptr());
+        auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());
 
         // Linearize volume
         linearize_volume<T, has_pads>(input_ptr,
@@ -251,7 +252,9 @@
                                       input_stride_y,
                                       input_stride_z,
                                       _input->info()->fixed_point_position(),
-                                      offset);
+                                      offset,
+                                      _dilation.x(),
+                                      _dilation.y());
     },
     in, out);
 }
@@ -309,28 +312,33 @@
 }
 
 NEIm2ColKernel::NEIm2ColKernel()
-    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false)
+    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
 {
 }
 
 void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                               bool has_bias, bool is_fully_connected, bool is_flatten)
+                               bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
-    ARM_COMPUTE_UNUSED(is_fully_connected);
-    ARM_COMPUTE_UNUSED(is_flatten);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
+    ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
+
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
     _input          = input;
     _output         = output;
     _conv_info      = conv_info;
     _kernel_width   = kernel_dims.width;
-    _kernel_height  = kernel_dims.height,
-    _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+    _kernel_height  = kernel_dims.height;
+    _dilation       = dilation;
+    _convolved_dims = scaled_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
                                         _kernel_width, _kernel_height,
-                                        _conv_info);
+                                        _conv_info, _dilation);
     _has_bias = has_bias;
 
     unsigned int stride_x = 0;
@@ -341,7 +349,8 @@
                                && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                               input->info()->tensor_shape().cend(),
                                               output->info()->tensor_shape().cbegin() + 1))
-                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
+                               && ((dilation.x() == 1) && (dilation.y() == 1));
 
     Window window = calculate_max_window(*input->info(), Steps());
 
@@ -396,9 +405,9 @@
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
         }
-        window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
-        window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
-        window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        window.set(width_idx, Window::Dimension(0, _convolved_dims.first, 1));
+        window.set(height_idx, Window::Dimension(0, _convolved_dims.second, 1));
+        window.set(channel_idx, Window::Dimension(0, 1, 1));
     }
 
     // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -408,9 +417,9 @@
 }
 
 Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                bool has_bias, bool is_fully_connected, bool is_flatten)
+                                bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
     return Status{};
 }
 
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 3bf1d940..91776d8 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,6 +67,55 @@
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
+
+    // Reduce shape on axis
+    TensorShape sum_shape = input->tensor_shape();
+    sum_shape.set(axis, 1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output, unsigned int axis)
+{
+    const unsigned int num_elems_processed_per_iteration     = 16 / data_size_from_type(input->data_type());
+    const unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_tuple(err, win);
+}
 } // namespace
 
 NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
@@ -77,18 +126,7 @@
 void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
-    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported normalization axis, Supported axis is 0");
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    unsigned int num_elems_processed_per_iteration     = 16 / data_size_from_type(input->info()->data_type());
-    unsigned int num_elems_processed_per_iteration_sum = (axis == 0) ? 1 : num_elems_processed_per_iteration;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
 
     _input   = input;
     _sum     = sum;
@@ -97,16 +135,18 @@
     _epsilon = epsilon;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_processed_per_iteration_sum);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(_input->info(), _sum->info(), _output->info(), axis);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    update_window_and_padding(win, input_access, sum_access, output_access);
+    INEKernel::configure(std::get<1>(win_config));
+}
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), sum->clone().get(), output->clone().get(), axis)));
 
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index 004ecd0..83593e7 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -356,13 +356,16 @@
     _termination          = termination;
     _use_initial_estimate = use_initial_estimate;
     _epsilon              = epsilon;
-    _num_iterations       = num_iterations;
     _window_dimension     = window_dimension;
     _level                = level;
     _num_levels           = num_levels;
     _pyramid_scale        = pyramid_scale;
     _num_levels           = num_levels;
 
+    // Set maximum number of iterations used for convergence
+    const size_t max_iterations = 1000;
+    _num_iterations             = (termination == Termination::TERM_CRITERIA_EPSILON) ? max_iterations : num_iterations;
+
     Window window;
     window.set(Window::DimX, Window::Dimension(0, old_points->num_values()));
     window.set(Window::DimY, Window::Dimension(0, 1));
@@ -471,7 +474,7 @@
         float prev_delta_x = 0.0f;
         float prev_delta_y = 0.0f;
 
-        for(unsigned int j = 0; j < _num_iterations || _termination == Termination::TERM_CRITERIA_EPSILON; ++j)
+        for(unsigned int j = 0; j < _num_iterations; ++j)
         {
             if(is_invalid_keypoint(new_keypoint))
             {
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 58da040..099626d 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -186,7 +186,7 @@
     win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
 
     Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 1, 1));
+    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
 
     Iterator ina(input0, win_a);
     Iterator out(output, win_out);
@@ -234,7 +234,7 @@
             asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
             asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
             asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif /* __arm __ */
+#endif /* __arm__ */
 
             acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
             acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
@@ -302,6 +302,37 @@
     },
     ina, out);
 }
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration_x = 16;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+    AccessWindowHorizontal input0_access(input0, 0, num_elems_processed_per_iteration_x);
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_x);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+    bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_tuple(err, win);
+}
 } // namespace
 
 NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel()
@@ -311,31 +342,27 @@
 
 void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration_x = 16;
-
     // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
 
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x),
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
-                              output_access);
+    INEKernel::configure(std::get<1>(win_config));
+}
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+Status NELocallyConnectedMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get())));
 
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index 01be36b..434f4eb 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -32,14 +32,60 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <algorithm>
 #include <arm_neon.h>
 #include <climits>
 #include <cstddef>
 
+using namespace arm_compute::misc::shape_calculator;
+
 namespace arm_compute
 {
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+        TensorShape output_shape = compute_min_max_shape(input);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    TensorShape output_shape = compute_min_max_shape(input);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, 2);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 NEMinMaxLayerKernel::NEMinMaxLayerKernel()
     : _input(nullptr), _output(nullptr), _mtx()
 {
@@ -47,36 +93,25 @@
 
 void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(Window::DimX, 2);
-    output_shape.remove_dimension(1);
-    output_shape.remove_dimension(1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    auto win_config = validate_and_configure_window(input->info(), output->info());
 
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, 2);
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    update_window_and_padding(win, input_access, output_access);
+    INEKernel::configure(std::get<1>(win_config));
+}
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+Status NEMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
 
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
@@ -160,7 +195,7 @@
 
     float32x2_t reset_values = vdup_n_f32(0.0f);
     reset_values             = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
-    reset_values             = vset_lane_f32(std::numeric_limits<float>::min(), reset_values, 1);
+    reset_values             = vset_lane_f32(std::numeric_limits<float>::lowest(), reset_values, 1);
 
     Window window_output;
     window_output.use_tensor_dimensions(_output->info()->tensor_shape());
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index ad66acd..b90e813 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/utility.h"
+#include "arm_compute/core/utils/misc/Utility.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index c271032..193ca37 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,18 +54,23 @@
 const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
 const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
 
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
 inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     ARM_COMPUTE_UNUSED(overflow_policy);
     ARM_COMPUTE_UNUSED(rounding_policy);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                     "Output can only be U8 if both inputs are U8");
 
+    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
     if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
     {
         // Check that all data types are the same and all fixed-point positions are the same
@@ -96,19 +101,44 @@
 
 inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+    const ValidRegion &valid_region = broadcast_pair.second;
+
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(*output, input1->tensor_shape());
+
+        if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
+        {
+            set_format_if_unknown(*output, Format::S16);
+        }
+        else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
+        {
+            set_format_if_unknown(*output, Format::F32);
+        }
+        else if(input1->data_type() == DataType::F16 || input2->data_type() == DataType::F16)
+        {
+            set_format_if_unknown(*output, Format::F16);
+        }
+        else if(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8)
+        {
+            set_data_type_if_unknown(*output, DataType::QS8);
+            set_fixed_point_position_if_zero(*output, input1->fixed_point_position());
+        }
+    }
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
+
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
-    bool window_changed = update_window_and_padding(win,
-                                                    AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
-                                                    AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
-                                                    output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
-                                                       input2->valid_region());
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
 
     output_access.set_valid_region(win, valid_region);
 
@@ -508,31 +538,12 @@
     ARM_COMPUTE_UNUSED(rounding_policy);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output->info(), Format::S16);
-        }
-        else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output->info(), Format::F32);
-        }
-        else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
-        {
-            set_format_if_unknown(*output->info(), Format::F16);
-        }
-        else if(input1->info()->data_type() == DataType::QS8 && input2->info()->data_type() == DataType::QS8)
-        {
-            set_data_type_if_unknown(*output->info(), DataType::QS8);
-            set_fixed_point_position_if_zero(*output->info(), input1->info()->fixed_point_position());
-        }
-    }
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
 
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
     _input1         = input1;
     _input2         = input2;
     _output         = output;
@@ -656,15 +667,13 @@
         ARM_COMPUTE_ERROR("You called with the wrong img formats");
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
 Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
                                                  RoundingPolicy rounding_policy)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
@@ -677,34 +686,71 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Iterator input1(_input1, window);
-    Iterator input2(_input2, window);
-    Iterator output(_output, window);
+    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+    const TensorShape &out_shape = _output->info()->tensor_shape();
+
+    bool can_collapse = true;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(INEKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    Iterator input1(_input1, slice_input1);
+    Iterator input2(_input2, slice_input2);
+    Iterator output(_output, slice);
 
     if(_func_int != nullptr)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(collapsed, [&](const Coordinates & id)
         {
             (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
+            collapsed.slide_window_slice_3D(slice_input1);
+            collapsed.slide_window_slice_3D(slice_input2);
         },
         input1, input2, output);
     }
     else if(_func_q_int != nullptr)
     {
         int fixed_point_position = _input1->info()->fixed_point_position();
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(collapsed, [&](const Coordinates & id)
         {
             (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
+            collapsed.slide_window_slice_3D(slice_input1);
+            collapsed.slide_window_slice_3D(slice_input2);
         },
         input1, input2, output);
     }
     else
     {
         ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(collapsed, [&](const Coordinates & id)
         {
             (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
+            collapsed.slide_window_slice_3D(slice_input1);
+            collapsed.slide_window_slice_3D(slice_input2);
         },
         input1, input2, output);
     }
 }
+
+BorderSize NEPixelWiseMultiplicationKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize(0, border, 0, 0);
+}
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index b6af517..7877cf5 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -53,20 +53,24 @@
 void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
 {
     TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(0, pooled_w);
-    output_shape.set(1, pooled_h);
+    output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH), pooled_w);
+    output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT), pooled_h);
 
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
 }
 
-template <bool exclude_padding>
+template <bool exclude_padding, DataLayout data_layout>
 inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
-    int       start_x = id.x() * stride_x - pad_x;
-    int       start_y = id.y() * stride_y - pad_y;
-    const int end_x   = std::min(start_x + pool_size_x, upper_bound_w);
-    const int end_y   = std::min(start_y + pool_size_y, upper_bound_h);
+    const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    int start_x = id[idx_width] * stride_x - pad_x;
+    int start_y = id[idx_height] * stride_y - pad_y;
+
+    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
     if(exclude_padding)
     {
         start_x = std::max(0, start_x);
@@ -175,7 +179,9 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
+                                    || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
     }
 
     return Status{};
@@ -193,12 +199,16 @@
                                                         BorderSize &border_size,
                                                         unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
 {
+    // Get data layout
+    DataLayout          data_layout                  = input->data_layout();
     unsigned int        num_elems_read_per_iteration = 0;
     unsigned int        num_elems_horizontal_window  = 0;
     int                 pool_stride_x                = 0;
     int                 pool_stride_y                = 0;
-    const int           input_width                  = input->dimension(0);
-    const int           input_height                 = input->dimension(1);
+    const int           idx_width                    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int           idx_height                   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int           input_width                  = input->dimension(idx_width);
+    const int           input_height                 = input->dimension(idx_height);
     const PadStrideInfo pad_stride_info              = pool_info.pad_stride_info();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
     const int  pool_pad_right  = pad_stride_info.pad_right();
@@ -206,18 +216,22 @@
     const int  pool_pad_left   = pad_stride_info.pad_left();
     const int  pool_pad_bottom = pad_stride_info.pad_bottom();
     const bool is_square       = pool_size_x == pool_size_y;
+
     // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
-                                                     input->dimension(1),
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
+                                                     input->dimension(idx_height),
                                                      pool_size_x,
                                                      pool_size_y,
                                                      pad_stride_info);
+    auto_init(input, output, pooled_w, pooled_h);
 
     //If it's not squared and optimized will be executed the MxN
     num_elems_read_per_iteration      = 1;
     num_elems_processed_per_iteration = 1;
     num_elems_horizontal_window       = 1;
 
+    const bool is_nhwc = data_layout == DataLayout::NHWC;
+
     if(is_square)
     {
         switch(input->data_type())
@@ -239,6 +253,11 @@
                 }
                 break;
             case DataType::QASYMM8:
+                if(is_nhwc)
+                {
+                    num_elems_processed_per_iteration = 8;
+                    break;
+                }
                 switch(pool_size_x)
                 {
                     case 2:
@@ -273,6 +292,11 @@
                 break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
+                if(is_nhwc)
+                {
+                    num_elems_processed_per_iteration = 8;
+                    break;
+                }
                 switch(pool_size_x)
                 {
                     case 2:
@@ -291,6 +315,11 @@
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
+                if(is_nhwc)
+                {
+                    num_elems_processed_per_iteration = 4;
+                    break;
+                }
                 switch(pool_size_x)
                 {
                     case 2:
@@ -313,35 +342,61 @@
                 break;
         }
     }
-    // Number of iterations in X dimension
-    const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
-
-    // Upper limit for the number of right/bottom border elements that are accessed
-    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
-    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
-
-    border_size         = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
-    border_size.right   = std::max(upper_bound_w, pool_pad_right);
-    border_size.bottom  = std::max(upper_bound_h, pool_pad_bottom);
-    bool window_changed = false;
-
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(0, pooled_w);
-    output_shape.set(1, pooled_h);
-    TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
-
-    Window             win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
-
-    if(output->total_size() != 0)
+    else
     {
+        if(is_nhwc)
+        {
+            if(DataType::QASYMM8 == input->data_type())
+            {
+                num_elems_processed_per_iteration = 8;
+            }
+            else
+            {
+                num_elems_processed_per_iteration = 4;
+            }
+        }
+    }
+
+    bool   window_changed = false;
+    Window win{};
+    if(data_layout == DataLayout::NCHW)
+    {
+        // Number of iterations in X dimension
+        const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
+
+        // Upper limit for the number of right/bottom border elements that are accessed
+        const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - input_height;
+
+        border_size        = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+        border_size.right  = std::max(upper_bound_w, pool_pad_right);
+        border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
+
+        TensorShape output_shape{ input->tensor_shape() };
+        output_shape.set(0, pooled_w);
+        output_shape.set(1, pooled_h);
+        TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
+
+        win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
+        AccessWindowStatic input_access(input, -pool_pad_left, -pool_pad_top, input_width + border_size.right, input_height + border_size.bottom);
+
         AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
         window_changed = update_window_and_padding(win, input_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
     else
     {
-        window_changed = update_window_and_padding(win, input_access);
+        TensorShape output_shape{ input->tensor_shape() };
+        output_shape.set(1, pooled_w);
+        output_shape.set(2, pooled_h);
+        TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
+
+        win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
+        AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -368,18 +423,25 @@
     const bool          exclude_padding   = pool_info.exclude_padding();
     const bool          is_global_pooling = pool_info.is_global_pooling();
     const int           pool_stride_x     = pad_stride_info.stride().first;
+    unsigned int        pool_size_x       = 0;
+    unsigned int        pool_size_y       = 0;
+
+    // Get data layout
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Update pool size in case of global pooling
-    const int pool_size_x = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size().width;
-    const int pool_size_y = is_global_pooling ? input->info()->dimension(1) : pool_info.pool_size().height;
+    pool_size_x = is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
+    pool_size_y = is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
 
     // Validate pool info before calling scaled_dimensions
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y));
 
     // Check output dimensions
     unsigned int pooled_w, pooled_h;
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
-                                                     input->info()->dimension(1),
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width),
+                                                     input->info()->dimension(idx_height),
                                                      pool_size_x,
                                                      pool_size_y,
                                                      pad_stride_info);
@@ -398,6 +460,7 @@
 
     // Get data type
     const DataType data_type = input->info()->data_type();
+    const bool     is_nchw   = data_layout == DataLayout::NCHW;
 
     // Select appropriate function
     if(data_type == DataType::QS8)
@@ -410,10 +473,10 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::AVG>;
+                            _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::AVG>;
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling2_q8<PoolingType::MAX>;
+                            _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::MAX>;
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -423,10 +486,10 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+                            _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::AVG>;
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+                            _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::MAX>;
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -436,7 +499,7 @@
                     switch(pool_type)
                     {
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+                            _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -449,7 +512,7 @@
             switch(pool_type)
             {
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_q8<PoolingType::MAX>;
+                    _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -463,10 +526,24 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::AVG, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
+                    }
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
+                    if(is_nchw)
+                    {
+                        _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw<PoolingType::MAX>;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -477,10 +554,24 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::AVG, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
+                    }
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
+                    if(is_nchw)
+                    {
+                        _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw<PoolingType::MAX>;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -491,10 +582,24 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::AVG, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::AVG, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::AVG, false>;
+                    }
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_qasymm8<PoolingType::MAX>;
+                    if(is_nchw)
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw<PoolingType::MAX>;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc<PoolingType::MAX>;
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -511,10 +616,10 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::AVG>;
+                            _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::AVG>;
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling2_q16<PoolingType::MAX>;
+                            _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::MAX>;
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -524,10 +629,10 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+                            _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::AVG>;
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+                            _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::MAX>;
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -537,7 +642,7 @@
                     switch(pool_type)
                     {
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+                            _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -550,7 +655,7 @@
             switch(pool_type)
             {
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_q16<PoolingType::MAX>;
+                    _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -567,13 +672,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::pooling2_f16_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -583,13 +709,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::pooling3_f16_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -599,13 +746,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -618,13 +786,34 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::AVG, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::AVG, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::AVG, false>;
+                    }
                     break;
                 case PoolingType::L2:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::L2, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::L2, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::L2, false>;
+                    }
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_f16<PoolingType::MAX, false>;
+                    if(is_nchw)
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw<PoolingType::MAX, false>;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc<PoolingType::MAX, false>;
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -641,13 +830,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::pooling2_f32_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -657,13 +867,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::pooling3_f32_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -673,13 +904,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::pooling7_f32_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -689,13 +941,34 @@
                     switch(pool_type)
                     {
                         case PoolingType::AVG:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+                            }
                             break;
                         case PoolingType::L2:
-                            _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+                            if(is_nchw)
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
+                            }
+                            else
+                            {
+                                _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+                            }
                             break;
                         case PoolingType::MAX:
-                            _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+                            if(is_nchw)
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
+                            }
+                            else
+                            {
+                                _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+                            }
                             break;
                         default:
                             ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -708,13 +981,34 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::AVG, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::AVG, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::AVG, false>;
+                    }
                     break;
                 case PoolingType::L2:
-                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::L2, false>;
+                    if(is_nchw)
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::L2, false>;
+                    }
+                    else
+                    {
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::L2, false>;
+                    }
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingMxN_f32<PoolingType::MAX, false>;
+                    if(is_nchw)
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw<PoolingType::MAX, false>;
+                    }
+                    else
+                    {
+                        _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc<PoolingType::MAX, false>;
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -729,7 +1023,7 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -794,7 +1088,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -908,7 +1202,7 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_q16_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -973,7 +1267,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window_input);
@@ -1012,7 +1306,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float16x4_t scale_v = vdup_n_f16(scale);
             // Perform pooling
             const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
@@ -1043,7 +1337,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator      input(_input, window_input);
@@ -1078,7 +1372,7 @@
 
         if(pooling_type != PoolingType::MAX)
         {
-            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float16x8_t scale_v = vdupq_n_f16(scale);
             res                       = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
         }
@@ -1105,7 +1399,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1141,7 +1435,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1168,7 +1462,7 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1244,7 +1538,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1364,7 +1658,7 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_q16_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1435,7 +1729,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1474,7 +1768,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1503,7 +1797,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1532,7 +1826,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1586,7 +1880,7 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1640,7 +1934,7 @@
 }
 
 template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_q16_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1690,7 +1984,7 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f16(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window_input);
@@ -1716,7 +2010,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
 
@@ -1813,7 +2107,116 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_f32(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window)
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    float16x8_t vres;
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const int idx_width  = id.y() * pool_stride_x;
+        const int idx_height = id.z() * pool_stride_y;
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                                                       pool_stride_y);
+            const float16x8_t scale_v = vdupq_n_f16(scale);
+
+            // Perform pooling
+            vres = vdupq_n_f16(0.0f);
+
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < pool_size_x; ++x)
+                {
+                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+                    {
+                        continue;
+                    }
+
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+
+                    // Get power of 2 in case of l2 pooling and accumulate
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        vres = vaddq_f16(vres, vmulq_f16(data, data));
+                    }
+                    else
+                    {
+                        vres = vaddq_f16(vres, data);
+                    }
+                }
+            }
+            // Divide by scale
+            vres = vmulq_f16(vres, scale_v);
+        }
+        else
+        {
+            vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                // Bounds check kept pad-adjusted on both sides, consistent with the AVG branch above and the F32/QASYMM8 NHWC kernels
+                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < pool_size_x; ++x)
+                {
+                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+                    {
+                        continue;
+                    }
+
+                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                           (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+                    vres                   = vmaxq_f16(vres, data);
+                }
+            }
+        }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
+            vres                        = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+        }
+
+        // Store result
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
+    },
+    input, output);
+
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    ARM_COMPUTE_UNUSED(window_input);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1837,7 +2240,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
             float32x4_t vres = vdupq_n_f32(0.0f);
@@ -1936,7 +2339,109 @@
 }
 
 template <PoolingType pooling_type, bool exclude_padding>
-void NEPoolingLayerKernel::poolingMxN_qasymm8(const Window &window_input, const Window &window)
+void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    float32x4_t vres;
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const int idx_width  = id.y() * pool_stride_x;
+        const int idx_height = id.z() * pool_stride_y;
+        if(pooling_type != PoolingType::MAX)
+        {
+            // Calculate scale
+            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                                                       pool_stride_y);
+            const float32x4_t scale_v = vdupq_n_f32(scale);
+
+            // Perform pooling
+            vres = vdupq_n_f32(0.0f);
+
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < pool_size_x; ++x)
+                {
+                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+                    {
+                        continue;
+                    }
+
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                       (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+
+                    // Get power of 2 in case of l2 pooling and accumulate
+                    if(pooling_type == PoolingType::L2)
+                    {
+                        vres = vmlaq_f32(vres, data, data);
+                    }
+                    else
+                    {
+                        vres = vaddq_f32(vres, data);
+                    }
+                }
+            }
+            // Divide by scale
+            vres = vmulq_f32(vres, scale_v);
+        }
+        else
+        {
+            vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < pool_size_x; ++x)
+                {
+                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+                    {
+                        continue;
+                    }
+
+                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                       (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+                    vres                   = vmaxq_f32(vres, data);
+                }
+            }
+        }
+
+        // Calculate square-root in case of l2 pooling
+        if(pooling_type == PoolingType::L2)
+        {
+            float32x4_t sqrt_reciprocal = vrsqrteq_f32(vres);
+            vres                        = vmulq_f32(vres, vmulq_f32(vrsqrtsq_f32(vmulq_f32(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+        }
+
+        // Store result
+        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
+    },
+    input, output);
+}
+
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
@@ -1963,7 +2468,7 @@
             uint32_t   sres = 0;
 
             // Calculate scale
-            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NCHW>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
             // Perform pooling
             for(int y = 0; y < pool_size_y; ++y)
@@ -2031,6 +2536,101 @@
     input, output);
 }
 
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+
+    const int pool_size_x     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().width;
+    const int pool_size_y     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().z() : _pool_info.pool_size().height;
+    const int pool_pad_right  = _pool_info.pad_stride_info().pad_right();
+    const int pool_pad_top    = _pool_info.pad_stride_info().pad_top();
+    const int pool_pad_left   = _pool_info.pad_stride_info().pad_left();
+    const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
+    int       pool_stride_x   = 0;
+    int       pool_stride_y   = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+    const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const int idx_width  = id.y() * pool_stride_x;
+        const int idx_height = id.z() * pool_stride_y;
+        if(pooling_type != PoolingType::MAX)
+        {
+            uint32x4_t vres1 = vdupq_n_u32(0);
+            uint32x4_t vres2 = vdupq_n_u32(0);
+
+            // Calculate scale
+            const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
+                                                                                       pool_stride_y);
+            const float32x4_t scale_v = vdupq_n_f32(scale);
+
+            // Perform pooling
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < pool_size_x; ++x)
+                {
+                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+                    {
+                        continue;
+                    }
+
+                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                     (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+
+                    const uint16x8_t data_u16 = vmovl_u8(data);
+                    vres1                     = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
+                    vres2                     = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
+                }
+            }
+            // Divide by scale
+            vres1 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres1), scale_v));
+            vres2 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres2), scale_v));
+
+            uint8x8_t res = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
+
+            // Store result
+            vst1_u8(output.ptr(), res);
+        }
+        else
+        {
+            uint8x8_t vres = vdup_n_u8(0);
+
+            for(int y = 0; y < pool_size_y; ++y)
+            {
+                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < pool_size_x; ++x)
+                {
+                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
+                    {
+                        continue;
+                    }
+
+                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                     (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+                    vres                 = vmax_u8(vres, data);
+                }
+            }
+
+            // Store result
+            vst1_u8(output.ptr(), vres);
+        }
+    },
+    input, output);
+}
+
 Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
@@ -2040,16 +2640,24 @@
     unsigned int num_elems_processed_per_iteration = 0;
     BorderSize   border_size(0);
 
-    const bool         is_global_pooling = pool_info.is_global_pooling();
-    const unsigned int pool_size_x       = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size().width;
-    const unsigned int pool_size_y       = is_global_pooling ? input->tensor_shape().y() : pool_info.pool_size().height;
+    const bool   is_global_pooling = pool_info.is_global_pooling();
+    unsigned int pool_size_x       = 0;
+    unsigned int pool_size_y       = 0;
+
+    // Get data layout
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    pool_size_x = is_global_pooling ? input->dimension(idx_width) : pool_info.pool_size().width;
+    pool_size_y = is_global_pooling ? input->dimension(idx_height) : pool_info.pool_size().height;
 
     // Validate pool info before calling scaled_dimensions
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(pool_size_x, pool_size_y));
 
     // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
-                                                     input->dimension(1),
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(idx_width),
+                                                     input->dimension(idx_height),
                                                      pool_size_x,
                                                      pool_size_y,
                                                      pool_info.pad_stride_info());
@@ -2073,39 +2681,48 @@
     const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
     const unsigned int pool_size     = _pool_info.pool_size().width;
 
-    // Set step for input in x and y direction for the input
-    Window       window_input(window);
-    unsigned int window_x_inc = 0;
-    switch(_input->info()->data_type())
+    Window window_input(window);
+    if(_input->info()->data_layout() == DataLayout::NCHW)
     {
-        case DataType::QS8:
-        case DataType::QS16:
-        case DataType::F16:
+        // Set step for input in x and y direction for the input
+        unsigned int window_x_inc = 0;
+        switch(_input->info()->data_type())
         {
-            window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
-            break;
-        }
-        case DataType::QASYMM8:
-        {
-            window_x_inc = pool_stride_x;
-            if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+            case DataType::QS8:
+            case DataType::QS16:
+            case DataType::F16:
             {
                 window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+                break;
             }
-            break;
+            case DataType::QASYMM8:
+            {
+                window_x_inc = pool_stride_x;
+                if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+                {
+                    window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+                }
+                break;
+            }
+            case DataType::F32:
+            {
+                window_x_inc = pool_stride_x;
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+            }
         }
-        case DataType::F32:
-        {
-            window_x_inc = pool_stride_x;
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not supported");
-        }
+        window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
+        window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
     }
-    window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
-    window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+    else
+    {
+        window_input.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), _num_elems_processed_per_iteration));
+        window_input.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
+        window_input.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+    }
 
     // Run function
     (this->*_func)(window_input, window);
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 767af08..ee23e76 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -34,6 +34,46 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+    // Configure window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     min_max_access(min_max, 0, 0, 2, min_max->dimension(1));
+
+    // Update window and padding
+    bool window_changed = update_window_and_padding(win, input_access, output_access, min_max_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win);
+}
+} // namespace
+
 NEQuantizationLayerKernel::NEQuantizationLayerKernel()
     : _input(nullptr), _output(nullptr), _min_max(nullptr)
 {
@@ -41,33 +81,27 @@
 
 void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() < 3);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, DataType::U8, 0);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), min_max->info()));
 
     _input   = input;
     _output  = output;
     _min_max = min_max;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), min_max->info());
 
-    // Configure window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     min_max_access(min_max->info(), 0, 0, 2, min_max->info()->dimension(1));
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    // Update window and padding
-    update_window_and_padding(win, input_access, output_access, min_max_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+    INEKernel::configure(std::get<1>(win_config));
+}
 
-    INEKernel::configure(win);
+Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), min_max->clone().get())));
+
+    return Status{};
 }
 
 void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 1a50ed8..30d42fa 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
@@ -94,6 +95,61 @@
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
 }
+
+TensorShape calculate_output_shape(const TensorShape &input_shape, unsigned int axis)
+{
+    TensorShape output_shape{ input_shape };
+    output_shape.set(axis, 1);
+
+    return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_UNUSED(op);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+
+        const TensorShape output_shape         = calculate_output_shape(input->tensor_shape(), axis);
+        const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int axis)
+{
+    // Calculate output shape and set if empty
+    const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+
+    unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_tuple(err, win);
+}
 } // namespace
 
 NEReductionOperationKernel::NEReductionOperationKernel()
@@ -109,19 +165,8 @@
 void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
 
-    // Calculate output shape and set if empty
-    TensorShape output_shape{ input->info()->tensor_shape() };
-    output_shape.set(axis, 1);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
 
@@ -131,14 +176,19 @@
     _op          = op;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
 
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    INEKernel::configure(win);
+    INEKernel::configure(std::get<1>(win_config));
+}
+
+Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
+
+    return Status{};
 }
 
 void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 1918a77..7111644 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,125 +28,337 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Utility.h"
 
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
+                          const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
+                          BorderMode border_mode, SamplingPolicy sampling_policy)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(output == input);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_UNUSED(border_mode);
+
+    const DataLayout data_layout = input->data_layout();
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)) == 0);
+
+    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+    }
+
+    if(policy == InterpolationPolicy::BILINEAR)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
+    }
+
+    if(policy == InterpolationPolicy::AREA)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
+                                                             InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+{
+    bool   window_changed{ false };
+    Window win{};
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    const ValidRegion &input_valid_region = input->valid_region();
+
+    if(offsets != nullptr)
+    {
+        AccessWindowHorizontal offsets_access(offsets, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, offsets_access);
+    }
+    if(dx != nullptr && dy != nullptr)
+    {
+        AccessWindowHorizontal dx_access(dx, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal dy_access(dy, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, dx_access, dy_access);
+    }
+
+    // Reads can occur within the valid region of the input
+    AccessWindowStatic input_access(input, input_valid_region.anchor[0] - border_size.left,
+                                    input_valid_region.anchor[1] - border_size.top,
+                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size.right,
+                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size.bottom);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, calculate_valid_region_scale(*input, output->tensor_shape(),
+                                                                     policy, sampling_policy, border_undefined));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output,
+                                                             InterpolationPolicy policy, bool border_undefined,
+                                                             SamplingPolicy sampling_policy, BorderSize border_size)
+{
+    bool   window_changed{ false };
+    Window win{};
+
+    const unsigned int num_elems_processed_per_iteration = (policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
+
+    // Configure kernel window
+    win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic input_access(input, 0, -border_size.top,
+                                    ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration),
+                                    input->tensor_shape()[1]);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    window_changed = update_window_and_padding(win, input_access, output_access);
+    output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(),
+                                                          policy, sampling_policy, border_undefined));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
+                                                        InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+{
+    std::pair<Status, Window> win_config;
+    switch(input->data_layout())
+    {
+        case DataLayout::NCHW:
+            win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, policy, border_undefined, sampling_policy, border_size);
+            break;
+        case DataLayout::NHWC:
+            win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size);
+            break;
+        default:
+            win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
+    }
+
+    return win_config;
+}
+
+template <typename T>
+inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
+                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c)
+{
+    Iterator in(input, win_in);
+    Iterator out(output, window);
+
+    const size_t offsets_stride = stride_w / sizeof(T);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+        const int  in_yi      = (id.z() + 0.5f) * hr;
+        const int  offset_row = in_yi * stride_h + id.x() * stride_c;
+        wrapper::vstore(reinterpret_cast<T *>(out.ptr()),
+                        wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row)));
+    },
+    in, out);
+}
+
+template <typename T>
+inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
+                                     float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+{
+    Iterator in(input, win_in);
+    Iterator out(output, window);
+
+    const size_t stride_w_elems = stride_w / sizeof(T);
+    const size_t stride_h_elems = stride_h / sizeof(T);
+
+    const int input_width  = input->info()->dimension(1);
+    const int input_height = input->info()->dimension(2);
+
+    const T *border_area = reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
+
+    auto is_valid = [](int x, int low_x, int high_x, int y, int low_y, int high_y)
+    {
+        return !(x < low_x || x > high_x || y < low_y || y > high_y);
+    };
+
+    int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1;
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto offset     = (*reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast<int>(sizeof(T));
+        const auto dx_scale   = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+        const auto dy_scale   = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+        const int  in_yi      = std::floor((id.z() + 0.5f) * hr - 0.5f);
+        const int  offset_row = in_yi * stride_h + id.x() * stride_c;
+        const T   *in_ptr     = reinterpret_cast<T *>(in.ptr() + offset * stride_w + offset_row);
+
+        if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size))
+        {
+            T a00 = 0, a01 = 0, a10 = 0, a11 = 0;
+
+            if(border_mode == BorderMode::CONSTANT)
+            {
+                a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : *border_area;
+                a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : *border_area;
+                a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : *border_area;
+                a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : *border_area;
+            }
+            else if(border_mode == BorderMode::REPLICATE)
+            {
+                auto clamped_x  = utility::clamp<int>(offset, 0, input_width - 1);
+                auto clamped_x1 = utility::clamp<int>(offset + 1, 0, input_width - 1);
+                auto clamped_y  = utility::clamp<int>(in_yi, 0, input_height - 1);
+                auto clamped_y1 = utility::clamp<int>(in_yi + 1, 0, input_height - 1);
+
+                a00 = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c);
+                a01 = *reinterpret_cast<T *>(in.ptr() + clamped_x1 * stride_w + clamped_y * stride_h + id.x() * stride_c);
+                a10 = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y1 * stride_h + id.x() * stride_c);
+                a11 = *reinterpret_cast<T *>(in.ptr() + clamped_x1 * stride_w + clamped_y1 * stride_h + id.x() * stride_c);
+            }
+            else
+            {
+                a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : 0;
+                a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : 0;
+                a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : 0;
+                a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : 0;
+            }
+
+            // Perform interpolation
+            const float dx1 = 1.0f - dx_scale;
+            const float dy1 = 1.0f - dy_scale;
+
+            const float w1 = dx1 * dy1;
+            const float w2 = dx_scale * dy1;
+            const float w3 = dx1 * dy_scale;
+            const float w4 = dx_scale * dy_scale;
+
+            // Store result
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+        }
+        else
+        {
+            if(border_mode == BorderMode::CONSTANT)
+            {
+                *reinterpret_cast<T *>(out.ptr()) = *border_area;
+            }
+            else if(border_mode == BorderMode::REPLICATE)
+            {
+                auto clamped_x                    = utility::clamp<int>(offset, 0, input_width - 1);
+                auto clamped_y                    = utility::clamp<int>(in_yi, 0, input_height - 1);
+                *reinterpret_cast<T *>(out.ptr()) = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c);
+            }
+        }
+    },
+    in, out);
+}
+} // namespace
 
 NEScaleKernel::NEScaleKernel()
-    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr)
+    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode()
 {
 }
 
 BorderSize NEScaleKernel::border_size() const
 {
-    return BorderSize(1);
+    return _border_size;
 }
 
-void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined,
-                              SamplingPolicy sampling_policy)
+void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
+                              ITensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(output == input);
-    ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
-    ARM_COMPUTE_UNUSED(sampling_policy);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  dx != nullptr ? dx->info() : nullptr,
+                                                  dy != nullptr ? dy->info() : nullptr,
+                                                  offsets != nullptr ? offsets->info() : nullptr,
+                                                  output->info(),
+                                                  policy, border_mode, sampling_policy));
+
+    // Get data layout and width/height indices
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    _input       = input;
+    _output      = output;
+    _offsets     = offsets;
+    _dx          = dx;
+    _dy          = dy;
+    _policy      = policy;
+    _border_size = BorderSize(1);
+    _border_mode = border_mode;
+
+    // Compute the ratio between source width/height and destination width/height
+    const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
+    const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
+
+    // Add constant border only on top in case of NHWC layout
+    if(data_layout == DataLayout::NHWC)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
+        _border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
     }
 
-    if(policy == InterpolationPolicy::BILINEAR)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
-    }
-
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input   = input;
-    _output  = output;
-    _offsets = offsets;
-    _dx      = dx;
-    _dy      = dy;
-
-    /* Compute the ratio between source width/height and destination width/height */
-    const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
-    const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
-
-    /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
     {
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
+    // Select interpolation function
     switch(policy)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
-            _func = &NEScaleKernel::scale_nearest;
+            _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_nearest_nchw : &NEScaleKernel::scale_nhwc;
             break;
         }
         case InterpolationPolicy::BILINEAR:
         {
-            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dx, 1, DataType::F32);
-            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dy, 1, DataType::F32);
-
-            _func = &NEScaleKernel::scale_bilinear;
+            _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_bilinear_nchw : &NEScaleKernel::scale_nhwc;
             break;
         }
         case InterpolationPolicy::AREA:
         {
-            _func = &NEScaleKernel::scale_area;
+            _func = &NEScaleKernel::scale_area_nchw;
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    const ValidRegion &input_valid_region = input->info()->valid_region();
-
-    // Reads can occur within the valid region of the input
-    AccessWindowStatic input_access(input->info(),
-                                    input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
-                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
-                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
-    AccessWindowHorizontal offsets_access(offsets == nullptr ? nullptr : offsets->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal dy_access(dy == nullptr ? nullptr : dy->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              input_access,
-                              offsets_access,
-                              dx_access,
-                              dy_access,
-                              output_access);
-
-    output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(), border_undefined));
-    INEKernel::configure(win);
+    // Configure window
+    std::pair<Status, Window> win_config = validate_and_configure_window(input->info(),
+                                                                         dx != nullptr ? dx->info() : nullptr,
+                                                                         dy != nullptr ? dy->info() : nullptr,
+                                                                         offsets != nullptr ? offsets->info() : nullptr,
+                                                                         output->info(),
+                                                                         policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
-void NEScaleKernel::scale_nearest(const Window &window)
+void NEScaleKernel::scale_nearest_nchw(const Window &window)
 {
     const size_t input_stride = _input->info()->strides_in_bytes()[1];
 
@@ -159,15 +371,16 @@
     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
 
+    // Set offsets window
     Window win_off;
     win_off.set(Window::DimX, window[Window::DimX]);
     win_off.set(Window::DimY, window[Window::DimY]);
-
     for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
     {
         win_off.set(d, Window::Dimension(0, 0, 0));
     }
 
+    // Create iterators
     Iterator in(_input, win_in);
     Iterator out(_output, window);
     Iterator offsets(_offsets, win_off);
@@ -300,7 +513,7 @@
     }
 }
 
-void NEScaleKernel::scale_bilinear(const Window &window)
+void NEScaleKernel::scale_bilinear_nchw(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
 
@@ -465,15 +678,16 @@
     }
 }
 
-void NEScaleKernel::scale_area(const Window &window)
+void NEScaleKernel::scale_area_nchw(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
 
-    // Don't increment in X and Y direction for the input tensor
+    // Don't increment in width/height/channels for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
     Window win_in(window);
     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     Iterator in(_input, win_in);
     Iterator out(_output, window);
@@ -513,6 +727,97 @@
     in, out);
 }
 
+void NEScaleKernel::scale_nhwc(const Window &window)
+{
+    // Get data layout and width/height indices
+    const DataLayout data_layout  = _input->info()->data_layout();
+    const int        idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        idx_width    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    const size_t input_stride_w = _input->info()->strides_in_bytes()[idx_width];
+    const size_t input_stride_h = _input->info()->strides_in_bytes()[idx_height];
+    const size_t input_stride_c = _input->info()->strides_in_bytes()[idx_channels];
+
+    // Compute the ratio between source height and destination height
+    const auto hr = static_cast<float>(_input->info()->dimension(idx_height)) / static_cast<float>(_output->info()->dimension(idx_height));
+
+    // Don't increment in width/height/channels for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::U8:
+        {
+            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+            {
+                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+            }
+            else
+            {
+                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr,
+                                                  window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+            }
+            break;
+        }
+        case DataType::S16:
+        {
+            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+            {
+                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+            }
+            else
+            {
+                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr,
+                                                  window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+            }
+            break;
+        }
+        case DataType::F32:
+        {
+            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+            {
+                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+            }
+            else
+            {
+                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr,
+                                                window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+}
+
+Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
+                               const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
+                               BorderMode border_mode, SamplingPolicy sampling_policy)
+{
+    BorderSize border_size(1);
+    if(input->data_layout() == DataLayout::NHWC)
+    {
+        border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, sampling_policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              dx != nullptr ? dx->clone().get() : nullptr,
+                                                              dy != nullptr ? dy->clone().get() : nullptr,
+                                                              offsets != nullptr ? offsets->clone().get() : nullptr,
+                                                              output->clone().get(),
+                                                              policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size)
+                                .first);
+
+    return Status{};
+}
+
 void NEScaleKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -522,3 +827,4 @@
 
     (this->*_func)(window);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 13d87a0..d91efd2 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -33,7 +33,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/utility.h"
+#include "arm_compute/core/utils/misc/Utility.h"
 
 #include <algorithm>
 #include <arm_neon.h>
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index c863ed4..e6f3acc 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 1501402..3031a87 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,12 +34,16 @@
 
 namespace
 {
-template <typename T>
+template <typename T, bool is_nhwc>
 void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
 {
-    const unsigned int kernel_size_x   = input->info()->dimension(0);
-    const unsigned int kernel_size_y   = input->info()->dimension(1);
-    const unsigned int kernel_depth    = input->info()->dimension(2);
+    DataLayout         data_layout     = input->info()->data_layout();
+    const int          idx_width       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int          idx_height      = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int          idx_channel     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int kernel_size_x   = input->info()->dimension(idx_width);
+    const unsigned int kernel_size_y   = input->info()->dimension(idx_height);
+    const unsigned int kernel_depth    = input->info()->dimension(idx_channel);
     const unsigned int input_stride_x  = input->info()->strides_in_bytes().x();
     const unsigned int input_stride_y  = input->info()->strides_in_bytes().y();
     const unsigned int input_stride_z  = input->info()->strides_in_bytes().z();
@@ -67,13 +71,13 @@
                 for(unsigned int i = 0; i < kernel_size_x; ++i)
                 {
                     *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
-                    tmp_input_ptr += input_stride_x;
+                    tmp_input_ptr += is_nhwc ? input_stride_y : input_stride_x;
                     tmp_output_ptr += output_stride_y;
                 }
-                curr_input_row_ptr += input_stride_y;
+                curr_input_row_ptr += is_nhwc ? input_stride_z : input_stride_y;
                 tmp_input_ptr = curr_input_row_ptr;
             }
-            curr_input_depth_ptr += input_stride_z;
+            curr_input_depth_ptr += is_nhwc ? input_stride_x : input_stride_z;
             curr_input_row_ptr = curr_input_depth_ptr;
             tmp_input_ptr      = curr_input_depth_ptr;
         }
@@ -161,21 +165,24 @@
     _bias   = bias;
     _output = output;
 
+    const DataLayout data_layout = input->info()->data_layout();
+    const bool       is_nhwc     = data_layout == DataLayout::NHWC;
+
     switch(_input->info()->element_size())
     {
         case 4:
         {
-            _func = &weights_reshape<uint32_t>;
+            _func = is_nhwc ? &weights_reshape<uint32_t, true> : &weights_reshape<uint32_t, false>;
             break;
         }
         case 2:
         {
-            _func = &weights_reshape<uint16_t>;
+            _func = is_nhwc ? &weights_reshape<uint16_t, true> : &weights_reshape<uint16_t, false>;
             break;
         }
         case 1:
         {
-            _func = &weights_reshape<uint8_t>;
+            _func = is_nhwc ? &weights_reshape<uint8_t, true> : &weights_reshape<uint8_t, false>;
             break;
         }
         default:
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..672684d
--- /dev/null
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+//Batched Gemms
+
+namespace
+{
+Status validate_arguments_winograd_gemm(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c, const ITensorInfo *output, const float alpha, const float beta,
+                                        const GEMMInfo &gemm_info = GEMMInfo())
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(b);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    if(c != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B");
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != a->num_dimensions());
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_UNUSED(alpha, beta);
+    return Status{};
+}
+
+Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != 3 && input->dimension(idx_width) != 5);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    const Size2D &output_tile = winograd_info.output_tile_size;
+    ARM_COMPUTE_RETURN_ERROR_ON(output_tile != Size2D(2U, 2U) && output_tile != Size2D(4U, 4U));
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_winograd_weight_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    const Size2D kernel_dims = winograd_info.kernel_size;
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info)));
+
+    unsigned int num_elems_processed_per_iteration_x = kernel_dims.width;
+    unsigned int num_elems_processed_per_iteration_y = kernel_dims.height;
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    bool   window_changed = false;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowStatic    output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+    window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+
+    return std::make_pair(err, win_collapsed);
+}
+
+Status validate_arguments_winograd_input_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    const Size2D        &kernel_dims = winograd_info.kernel_size;
+    const PadStrideInfo &conv_info   = winograd_info.convolution_info;
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd input transform only supports 3x3 and 5x5 kernels");
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_winograd_input_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_dims      = winograd_info.kernel_size;
+    const TensorShape   output_shape     = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+    unsigned int num_elems_read_per_iteration_x = (output_tile_size.width + kernel_dims.width - 1);
+    unsigned int num_elems_read_per_iteration_y = (output_tile_size.height + kernel_dims.height - 1);
+
+    Window win = calculate_max_window(*input, Steps(1, 1));
+
+    AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+
+    bool window_changed = update_window_and_padding(win, input_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Status validate_arguments_winograd_output_trans(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    const PadStrideInfo &conv_info   = winograd_info.convolution_info;
+    const Size2D         kernel_dims = winograd_info.kernel_size;
+
+    // Number of tiles along the X and Y direction
+    const unsigned int num_tiles_x = std::ceil((winograd_info.input_dimensions.x() - (kernel_dims.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>
+                                               (winograd_info.output_tile_size.width));
+    const unsigned int num_tiles_y = std::ceil((winograd_info.input_dimensions.y() - (kernel_dims.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>
+                                               (winograd_info.output_tile_size.height));
+    const Size2D       num_tiles   = Size2D(num_tiles_x, num_tiles_y);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(winograd_info.output_data_layout != DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd output transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd output transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((input->dimension(2) != size_t(16U)) && (input->dimension(2) != size_t(36U))), "Only 2x2 and 4x4 output tile is supported");
+    ARM_COMPUTE_UNUSED(kernel_dims);
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != size_t(1));
+    }
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_winograd_output_trans(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info)));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    AccessWindowStatic    output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), 2), ceil_to_multiple(output->dimension(1), 2));
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+        window_changed = update_window_and_padding(win, input_access, bias_access, output_access);
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, output_access);
+    }
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerBatchedGEMMKernel()
+    : _gemms()
+{
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const unsigned int n_gemms,
+    const int M, const int K, const int N,
+    const int        a_matrix_stride,
+    const int        a_row_stride,
+    const int        b_matrix_stride,
+    const int        b_row_stride,
+    const int        c_matrix_stride,
+    const int        c_row_stride,
+    const TIn *const a_ptr,
+    const TIn *const b_ptr,
+    TOut *const      c_ptr)
+{
+    _gemms = support::cpp14::make_unique<MultiGEMM>(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr);
+    Window win;
+    auto   win_last = _gemms->get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    const size_t first_gemm = window.x().start();
+    const size_t last_gemm  = window.x().end();
+    _gemms->run(first_gemm, last_gemm);
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_gemms() const
+{
+    return WinogradBase::N_GEMMS;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_rows() const
+{
+    return _output_tile_rows;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_cols() const
+{
+    return _output_tile_cols;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_blocks() const
+{
+    return WinogradConv::N_BLOCK;
+}
+
+template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensor *c,
+                                                                                                                     const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_gemm(a, b, c, output, alpha, beta, gemm_info));
+    return Status{};
+}
+
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>;
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>;
+template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>;
+
+// Weights transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int n_output_channels, int n_input_channels) const
+{
+    const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels);
+    return static_cast<unsigned int>(
+               // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
+               WinogradConv::get_kernel_storage_size(shape) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
+    : _transform()
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(const KernelShape &kernel_shape) const
+{
+    return WinogradConv::get_kernel_matrix_stride(kernel_shape);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const ITensor *weights_hwio,
+    T *const       output,
+    const int      matrix_stride,     /** Stride across matrices in the output. */
+    const int      n_output_channels, /** Number of filters. */
+    const int      n_input_channels)  /** Number of channels in each filter. */
+{
+    const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK);
+    _transform                  = support::cpp14::make_unique<WeightsTransform>(reinterpret_cast<T *>(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels,
+                                                                                n_input_channels);
+    Window win;
+    auto   win_last = _transform->get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    _transform->run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
+{
+    return false;
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                                                                                  const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_weight_trans(input, output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_weight_trans(input->clone().get(), output->clone().get(), winograd_info).first);
+    return Status{};
+}
+
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
+
+// Input transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
+    int  n_batches,   /** Number of batches in the input tensor. */
+    int  n_channels,  /** Number of feature maps in the input tensor. */
+    int  n_rows,      /** Number of rows in each feature map. */
+    int  n_cols,      /** Number of columns in each feature map. */
+    bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+) const
+{
+    // Construct shapes for the input and kernel tensors.
+    const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
+    const KernelShape   kern_shape(1, KernelRows, KernelCols, n_channels);
+    const PaddingType   padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+    // Return the size, converted into units of TIn
+    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
+    const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
+{
+    return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
+    : _transform()
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const T *const    input,         /** Input tensor data */
+    const int         n_batches,     /** Number of batches in input tensor. */
+    const int         n_rows,        /** Number of rows in input tensor. */
+    const int         n_cols,        /** Number of columns in input tensor. */
+    const int         n_channels,    /** Number of channels in input tensor. */
+    const PaddingType padding,       /** Padding type. */
+    T *const          output,        /** Base of output matrices. */
+    const int         matrix_stride) /** Stride between output matrices. */
+{
+    //  _input_matrix_row_stride(n_input_channels),
+    _transform = support::cpp14::make_unique<InputTransform>(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels);
+    Window win;
+    auto   win_last = _transform->get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    _transform->run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_input_trans(input, output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_input_trans(input->clone().get(), output->clone().get(), winograd_info).first);
+
+    return Status{};
+}
+
+template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
+
+// Output transform
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
+    int  n_batches,         /** Number of batches in the output tensor. */
+    int  n_rows,            /** Number of rows in each feature map of the input tensor. */
+    int  n_cols,            /** Number of columns in each feature map of the input tensor. */
+    int  n_output_channels, /** Number of feature maps in the output tensor. */
+    bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
+) const
+{
+    // Construct shapes for the input and kernel tensors.
+    const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
+    const KernelShape   kern_shape(n_output_channels, KernelRows, KernelCols, 1);
+    const PaddingType   padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+
+    // Return the size, converted into units of TOut
+    return static_cast<unsigned int>(
+               WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T));
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
+    : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0)
+{
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
+    const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
+{
+    return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type);
+}
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Tensor4DShape NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
+    const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const
+{
+    return WinogradConv::get_output_shape(kernel_shape, in_shape, padding);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
+    const ITensor *biases,
+    const T *const output_workingspace,
+    const int      matrix_stride,
+    T *const       output,
+    const int      n_batches,
+    const int      n_rows,
+    const int      n_cols,
+    const int      n_channels)
+{
+    _biases            = biases;
+    _output_workspace  = output_workingspace;
+    _matrix_stride     = matrix_stride;
+    _matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK);
+    _output            = output;
+    _n_batches         = n_batches;
+    _n_rows            = n_rows;
+    _n_cols            = n_cols;
+    _n_channels        = n_channels;
+
+    // We don't have the biases buffer at this stage as it hasn't been allocated yet, so we pass in nullptr; OutputTransform is only used here to compute the window.
+    OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels);
+    Window          win;
+    auto            win_last = output_transform.get_window();
+    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+    INEKernel::configure(win);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
+
+    OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
+                                     (_biases ? reinterpret_cast<T *>(_biases->buffer()) : nullptr), _output,
+                                     _n_batches, _n_rows, _n_cols, _n_channels);
+
+    // The code below cannot be moved to configure() because the biases buffer hasn't been allocated at that point
+    const size_t fst = window.x().start();
+    const size_t lst = window.x().end();
+    output_transform.run(fst, lst);
+}
+
+template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
+Status NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                                                                                                                 const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_output_trans(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_output_trans(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get(),
+                                                                                    winograd_info)
+                                .first);
+
+    return Status{};
+}
+
+template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
+
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
deleted file mode 100644
index b2e44f8..0000000
--- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-//Batched Gemms
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerBatchedGEMMKernel()
-    : _gemms()
-{
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
-    const unsigned int n_gemms,
-    const int M, const int K, const int N,
-    const int        a_matrix_stride,
-    const int        a_row_stride,
-    const int        b_matrix_stride,
-    const int        b_row_stride,
-    const int        c_matrix_stride,
-    const int        c_row_stride,
-    const TIn *const a_ptr,
-    const TIn *const b_ptr,
-    TOut *const      c_ptr)
-{
-    _gemms = support::cpp14::make_unique<MultiGEMM>(n_gemms, M, K, N, a_matrix_stride, a_row_stride, b_matrix_stride, b_row_stride, c_matrix_stride, c_row_stride, a_ptr, b_ptr, c_ptr);
-    Window win;
-    auto   win_last = _gemms->get_window();
-    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-    INEKernel::configure(win);
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    const size_t first_gemm = window.x().start();
-    const size_t last_gemm  = window.x().end();
-    _gemms->run(first_gemm, last_gemm);
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_gemms() const
-{
-    return WinogradBase::N_GEMMS;
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_rows() const
-{
-    return _output_tile_rows;
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_tile_cols() const
-{
-    return _output_tile_cols;
-}
-
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerBatchedGEMMKernel<TIn, TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_number_blocks() const
-{
-    return WinogradConv::N_BLOCK;
-}
-
-template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>;
-template class NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>;
-
-// Weights transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int n_output_channels, int n_input_channels) const
-{
-    const KernelShape shape(n_output_channels, KernelRows, KernelCols, n_input_channels);
-    return static_cast<unsigned int>(
-               // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T
-               WinogradConv::get_kernel_storage_size(shape) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformWeightsKernel()
-    : _transform()
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(const KernelShape &kernel_shape) const
-{
-    return WinogradConv::get_kernel_matrix_stride(kernel_shape);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
-    const ITensor *weights_hwio,
-    T *const       output,
-    const int      matrix_stride,     /** Stride across matrices in the output. */
-    const int      n_output_channels, /** Number of filters. */
-    const int      n_input_channels)  /** Number of channels in each filter. */
-{
-    const int matrix_row_stride = roundup(n_output_channels, WinogradConv::N_BLOCK);
-    _transform                  = support::cpp14::make_unique<WeightsTransform>(reinterpret_cast<T *>(weights_hwio->buffer()), output, matrix_stride, matrix_row_stride, n_output_channels,
-                                                                                n_input_channels);
-    Window win;
-    auto   win_last = _transform->get_window();
-    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-    INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    const size_t fst = window.x().start();
-    const size_t lst = window.x().end();
-    _transform->run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
-    return false;
-}
-
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
-
-// Input transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size(
-    int  n_batches,   /** Number of batches in the input tensor. */
-    int  n_channels,  /** Number of feature maps in the input tensor. */
-    int  n_rows,      /** Number of rows in each feature map. */
-    int  n_cols,      /** Number of columns in each feature map. */
-    bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
-) const
-{
-    // Construct shapes for the input and kernel tensors.
-    const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
-    const KernelShape   kern_shape(1, KernelRows, KernelCols, n_channels);
-    const PaddingType   padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
-    // Return the size, converted into units of TIn
-    return static_cast<unsigned int>(WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
-    const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
-{
-    return WinogradConv::get_input_matrix_stride(kernel_shape, input_shape, padding_type);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformInputKernel()
-    : _transform()
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
-    const T *const    input,         /** Input tensor data */
-    const int         n_batches,     /** Number of batches in input tensor. */
-    const int         n_rows,        /** Number of rows in input tensor. */
-    const int         n_cols,        /** Number of columns in input tensor. */
-    const int         n_channels,    /** Number of channels in input tensor. */
-    const PaddingType padding,       /** Padding type. */
-    T *const          output,        /** Base of output matrices. */
-    const int         matrix_stride) /** Stride between output matrices. */
-{
-    //  _input_matrix_row_stride(n_input_channels),
-    _transform = support::cpp14::make_unique<InputTransform>(input, n_batches, n_rows, n_cols, n_channels, padding, output, matrix_stride, n_channels);
-    Window win;
-    auto   win_last = _transform->get_window();
-    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-    INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    const size_t fst = window.x().start();
-    const size_t lst = window.x().end();
-    _transform->run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
-    return false;
-}
-
-template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
-
-// Output transform
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-unsigned int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size(
-    int  n_batches,         /** Number of batches in the output tensor. */
-    int  n_rows,            /** Number of rows in each feature map of the input tensor. */
-    int  n_cols,            /** Number of columns in each feature map of the input tensor. */
-    int  n_output_channels, /** Number of feature maps in the output tensor. */
-    bool same_padding       /** Use "SAME" padding, otherwise use "VALID". */
-) const
-{
-    // Construct shapes for the input and kernel tensors.
-    const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
-    const KernelShape   kern_shape(n_output_channels, KernelRows, KernelCols, 1);
-    const PaddingType   padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
-
-    // Return the size, converted into units of TOut
-    return static_cast<unsigned int>(
-               WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) / sizeof(T));
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::NEWinogradLayerTransformOutputKernel()
-    : _biases(nullptr), _output_workspace(nullptr), _matrix_stride(0), _matrix_row_stride(0), _output(nullptr), _n_batches(0), _n_rows(0), _n_cols(0), _n_channels(0)
-{
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-int NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(
-    const KernelShape &kernel_shape, const Tensor4DShape &input_shape, const PaddingType padding_type) const
-{
-    return WinogradConv::get_output_matrix_stride(kernel_shape, input_shape, padding_type);
-}
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-Tensor4DShape NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape(
-    const KernelShape &kernel_shape, const Tensor4DShape &in_shape, const PaddingType padding) const
-{
-    return WinogradConv::get_output_shape(kernel_shape, in_shape, padding);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
-    const ITensor *biases,
-    const T *const output_workingspace,
-    const int      matrix_stride,
-    T *const       output,
-    const int      n_batches,
-    const int      n_rows,
-    const int      n_cols,
-    const int      n_channels)
-{
-    _biases            = biases;
-    _output_workspace  = output_workingspace;
-    _matrix_stride     = matrix_stride;
-    _matrix_row_stride = roundup(n_channels, WinogradConv::N_BLOCK);
-    _output            = output;
-    _n_batches         = n_batches;
-    _n_rows            = n_rows;
-    _n_cols            = n_cols;
-    _n_channels        = n_channels;
-
-    // We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window
-    OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride, nullptr, _output, _n_batches, _n_rows, _n_cols, _n_channels);
-    Window          win;
-    auto            win_last = output_transform.get_window();
-    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-    INEKernel::configure(win);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_biases->buffer());
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_output);
-
-    OutputTransform output_transform(_output_workspace, _matrix_stride, _matrix_row_stride,
-                                     reinterpret_cast<T *>(_biases->buffer()), _output,
-                                     _n_batches, _n_rows, _n_cols, _n_channels);
-
-    // The code below cannot be moved to configure because biases hasn't been allocated at that point
-    const size_t fst = window.x().start();
-    const size_t lst = window.x().end();
-    output_transform.run(fst, lst);
-}
-
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-bool NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const
-{
-    return false;
-}
-
-template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
-template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
-
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
deleted file mode 100644
index bffcbbf..0000000
--- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8);
-
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
-    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
-    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
-    const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
-    const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = _output->info()->tensor_shape().x();
-    const int K = _input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(_input0, window);
-    Iterator out(_output, window);
-
-    GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *workspace      = _workspace->buffer() + offset;
-    size_t           workspace_size = _workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
-                     reinterpret_cast<const float *>(in1_ptr), ldb,
-                     reinterpret_cast<float *>(out.ptr()), ldc,
-                     _alpha, _beta, workspace);
-    },
-    in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
deleted file mode 100644
index 0eaa9aa..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
-    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
-    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
-    const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
-    const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = _output->info()->tensor_shape().x();
-    const int K = _input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(_input0, window);
-    Iterator out(_output, window);
-
-    GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *workspace      = _workspace->buffer() + offset;
-    size_t           workspace_size = _workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
-                     reinterpret_cast<const float *>(in1_ptr), ldb,
-                     reinterpret_cast<float *>(out.ptr()), ldc,
-                     _alpha, _beta, workspace);
-    },
-    in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
deleted file mode 100644
index 0b3212b..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch64NativeKernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
-                                                   bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(16U, 4U));
-
-    const int input0_access_end_x = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
-    const int input0_access_end_y = ceil_to_multiple(input0->info()->tensor_shape().y(), 4);
-    const int input1_access_end_x = ceil_to_multiple(input1->info()->tensor_shape().x(), 16);
-
-    AccessWindowStatic    input0_access(input0->info(), 0, 0, input0_access_end_x, input0_access_end_y);
-    AccessWindowStatic    input1_access(input1->info(), 0, 0, input1_access_end_x, input1->info()->tensor_shape().y());
-    AccessWindowRectangle output_access(output->info(), 0, 0, 16, 4);
-    update_window_and_padding(win, input0_access, input1_access, output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEGEMMAArch64NativeKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_UNUSED(info);
-
-    const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
-    // Calculate row strides for each matrix
-    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
-    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
-    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
-    // Calculate matrix sizes
-    const int M = std::min(_input0->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int K = _input0->info()->tensor_shape().x();
-    const int N = _input1->info()->tensor_shape().x();
-
-    // Create window (Only iterate over batches)
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    // Create Iterators
-    Iterator in0(_input0, window);
-    Iterator out(_output, window);
-
-    // Execute GEMM
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        BlockedGemm<4, 16, float, float>(reinterpret_cast<const float *>(in0.ptr()),
-                                         reinterpret_cast<const float *>(in1_ptr),
-                                         reinterpret_cast<float *>(out.ptr()),
-                                         M, K, N,
-                                         lda, ldb, ldc);
-    },
-    in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
deleted file mode 100644
index 80606dc..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel()
-    : _func(nullptr)
-{
-}
-
-void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
-                               const Window     &window,
-                               const ThreadInfo &info)
-{
-    const int lda = input0->info()->strides_in_bytes().y();
-    const int ldb = input1->info()->strides_in_bytes().y();
-    const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
-    const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
-    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = output->info()->tensor_shape().x();
-    const int K = input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(input0, window);
-    Iterator out(output, window);
-
-    GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *_workspace     = workspace->buffer() + offset;
-    size_t           workspace_size = workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
-                     reinterpret_cast<const int8_t *>(in1_ptr), ldb,
-                     reinterpret_cast<int32_t *>(out.ptr()), ldc,
-                     alpha, beta, _workspace);
-    },
-    in0, out);
-}
-
-void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
-                               const Window     &window,
-                               const ThreadInfo &info)
-{
-    const int lda = input0->info()->strides_in_bytes().y();
-    const int ldb = input1->info()->strides_in_bytes().y();
-    const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
-    const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
-    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = output->info()->tensor_shape().x();
-    const int K = input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(input0, window);
-    Iterator out(output, window);
-
-    GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *_workspace     = workspace->buffer() + offset;
-    size_t           workspace_size = workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
-                     reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
-                     reinterpret_cast<uint32_t *>(out.ptr()), ldc,
-                     alpha, beta, _workspace);
-    },
-    in0, out);
-}
-
-void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
-                                                    bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    switch(input0->info()->data_type())
-    {
-        case DataType::S8:
-            _func = &gemm_interleaved_s16_12x8;
-            break;
-        case DataType::U8:
-            _func = &gemm_interleaved_u16_12x8;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
deleted file mode 100644
index 38f82f0..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel()
-    : _func(nullptr)
-{
-}
-
-void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
-                         const ThreadInfo &info)
-{
-    const int lda = input0->info()->strides_in_bytes().y();
-    const int ldb = input1->info()->strides_in_bytes().y();
-    const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
-    const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
-    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = output->info()->tensor_shape().x();
-    const int K = input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(input0, window);
-    Iterator out(output, window);
-
-    GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *_workspace     = workspace->buffer() + offset;
-    size_t           workspace_size = workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
-                     reinterpret_cast<const int8_t *>(in1_ptr), ldb,
-                     reinterpret_cast<int32_t *>(out.ptr()), ldc,
-                     alpha, beta, _workspace);
-    },
-    in0, out);
-}
-
-void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
-                         const ThreadInfo &info)
-{
-    const int lda = input0->info()->strides_in_bytes().y();
-    const int ldb = input1->info()->strides_in_bytes().y();
-    const int ldc = output->info()->strides_in_bytes().y() / sizeof(uint32_t);
-
-    const auto in1_ptr = reinterpret_cast<const uint8_t *>(input1->buffer());
-
-    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = output->info()->tensor_shape().x();
-    const int K = input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(input0, window);
-    Iterator out(output, window);
-
-    GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *_workspace     = workspace->buffer() + offset;
-    size_t           workspace_size = workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
-                     reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
-                     reinterpret_cast<uint32_t *>(out.ptr()), ldc,
-                     alpha, beta, _workspace);
-    },
-    in0, out);
-}
-
-void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
-                                                 bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    switch(input0->info()->data_type())
-    {
-        case DataType::S8:
-            _func = &gemm_interleaved_s8;
-            break;
-        case DataType::U8:
-            _func = &gemm_interleaved_u8;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-    }
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 4, 4);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 4);
-
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
deleted file mode 100644
index d4fcf5e..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8.2-A targets
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-
-namespace
-{
-using namespace arm_compute;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::U8, DataType::S8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
-    // Configure kernel window
-    Window win = calculate_max_window(*output);
-
-    AccessWindowRectangle output_access(output, 0, 0, 12, 8);
-
-    const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
-    const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
-
-    bool window_changed = update_window_and_padding(win,
-                                                    AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
-                                                    AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
-                                                    output_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-template <typename strategy, typename To, typename Tr>
-void *align_workspace(GemmInterleaved<strategy, To, Tr> &gemm, const ThreadInfo &info, ITensor *ws)
-{
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *workspace      = ws->buffer() + offset;
-    size_t           workspace_size = ws->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-    return workspace;
-}
-
-template <typename strategy>
-void execute_gemm(const Window &win, Iterator &in0, Iterator &in1, Iterator &out,
-                  const ThreadInfo &info, ITensor *ws, int M, int N, int K, bool is_transposed_0, bool is_transposed_1,
-                  int lda, int ldb, int ldc, float alpha, float beta)
-{
-    ARM_COMPUTE_UNUSED(M);
-    ARM_COMPUTE_UNUSED(N);
-    ARM_COMPUTE_UNUSED(K);
-    ARM_COMPUTE_UNUSED(is_transposed_0);
-    ARM_COMPUTE_UNUSED(is_transposed_1);
-    GemmInterleaved<strategy, typename strategy::operand_type, typename strategy::result_type> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-    void *workspace = align_workspace(gemm, info, ws);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const typename strategy::operand_type *>(in0.ptr()), lda,
-                     reinterpret_cast<const typename strategy::operand_type *>(in1.ptr()), ldb,
-                     reinterpret_cast<typename strategy::result_type *>(out.ptr()), ldc,
-                     alpha, beta, workspace);
-    },
-    in0, out);
-}
-} // namespace
-
-namespace arm_compute
-{
-void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
-                                                     bool is_transposed_1)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const int lda = _input0->info()->strides_in_bytes().y();
-    const int ldb = _input1->info()->strides_in_bytes().y();
-    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(uint32_t);
-
-    const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = _output->info()->tensor_shape().x();
-    const int K = _input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(_input0, window);
-    Iterator in1(_input1, window);
-    Iterator out(_output, window);
-
-    switch(_input0->info()->data_type())
-    {
-        case DataType::QASYMM8:
-        case DataType::U8:
-        {
-            execute_gemm<gemm_u8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
-            break;
-        }
-        case DataType::S8:
-        {
-            execute_gemm<gemm_s8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
-        }
-    }
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
diff --git a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
deleted file mode 100644
index 163014b..0000000
--- a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMVAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 1);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEGEMVAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
-    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
-    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(sgemv_trans::result_type);
-
-    const auto in1_ptr = reinterpret_cast<const sgemv_trans::operand_type *>(_input1->buffer());
-
-    const int N = _output->info()->tensor_shape().x();
-    const int K = _input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(_input0, window);
-    Iterator out(_output, window);
-
-    GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type> gemm(&info.cpu_info, N, K);
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *workspace      = _workspace->buffer() + offset;
-    size_t           workspace_size = _workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const sgemv_trans::operand_type *>(in0.ptr()), lda,
-                     reinterpret_cast<const sgemv_trans::operand_type *>(in1_ptr), ldb,
-                     reinterpret_cast<sgemv_trans::result_type *>(out.ptr()), ldc,
-                     _alpha, _beta, workspace);
-    },
-    in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
deleted file mode 100644
index e84409c..0000000
--- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
-namespace arm_compute
-{
-void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
-                                                  bool is_transposed_1)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _workspace       = workspace;
-    _alpha           = alpha;
-    _beta            = beta;
-    _is_transposed_0 = is_transposed_0;
-    _is_transposed_1 = is_transposed_1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 24, 8);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 24);
-
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
-
-    INEKernel::configure(win);
-}
-
-void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
-    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
-    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::result_type);
-
-    const auto in1_ptr = reinterpret_cast<const hgemm_24x8::operand_type *>(_input1->buffer());
-
-    const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
-    const int N = _output->info()->tensor_shape().x();
-    const int K = _input0->info()->tensor_shape().x();
-
-    // Only iterate over batches
-    Window win(window);
-    win.set(0, Window::Dimension(0, 1, 1));
-    win.set(1, Window::Dimension(0, 1, 1));
-
-    Iterator in0(_input0, window);
-    Iterator out(_output, window);
-
-    GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
-    constexpr size_t alignment      = 4096;
-    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
-    void            *workspace      = _workspace->buffer() + offset;
-    size_t           workspace_size = _workspace->info()->total_size();
-
-    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
-    {
-        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
-    }
-
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        gemm.execute(reinterpret_cast<const hgemm_24x8::operand_type *>(in0.ptr()), lda,
-                     reinterpret_cast<const hgemm_24x8::operand_type *>(in1_ptr), ldb,
-                     reinterpret_cast<hgemm_24x8::result_type *>(out.ptr()), ldc,
-                     _alpha, 1.f, workspace);
-    },
-    in0, out);
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
new file mode 100644
index 0000000..b3fcb33
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+// Macro to use in assembler to get a preload.  Needed because of various
+// workarounds needed to get working preload behaviour.
+//
+// Code using these macros needs to clobber x20 and x21 as they might be
+// used by the workaround.
+
+// "Correct" version
+#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+
+// Lee's uarchsim hack
+//#define ASM_PREFETCH(address)    "LDNP x20, x21, " address "\n"
+
+// No preload at all
+//#define ASM_PREFETCH(address) ""
+#else
+
+// "Correct" versions for AArch32
+#define ASM_PREFETCH(address) "PLD " address "\n"
+#define ASM_PREFETCHW(address) "PLDW " address "\n"
+
+#endif
+
+/*
+ * Do some prefetches.
+ */
+template <typename T>
+static inline void prefetch_6x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        ASM_PREFETCH("[%[pfp], #256]")
+        ASM_PREFETCH("[%[pfp], #320]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_5x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        ASM_PREFETCH("[%[pfp], #256]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_4x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_3x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_2x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_1x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
new file mode 100644
index 0000000..dd74744
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+#ifndef NO_MULTI_THREADING
+#include <atomic>
+#include <mutex>
+
+#define USE_SEMAPHORE
+
+#ifdef USE_SEMAPHORE
+#include <condition_variable>
+#endif
+
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+enum class BufferStatus
+{
+    IDLE,
+    POPULATING,
+    BUSY
+};
+
+class Buffer
+{
+private:
+    const int   _maxusers; // Maximum permissible threads.
+    void *const _storage;  // Storage for buffer content.
+
+    int _numusers; // Actual number of threads (might be lower).
+
+    volatile BufferStatus _status = BufferStatus::IDLE; // Status
+    std::atomic_int       _users  = {};                 // How many users are still using the buffer.
+    volatile int          _index  = 0;                  // Which block of data currently resides in the buffer.
+
+    std::mutex _lock = {};
+#ifdef USE_SEMAPHORE
+    std::condition_variable _cv = {};
+#endif
+
+    template <typename T>
+    void populate_buffer(T func)
+    {
+        func(_storage);
+
+        /* Now mark it as ready. */
+#ifdef USE_SEMAPHORE
+        {
+            std::unique_lock<std::mutex> ul(_lock);
+            _status = BufferStatus::BUSY;
+            _cv.notify_all();
+        }
+#else
+        _status     = BufferStatus::BUSY;
+#endif
+    }
+
+public:
+    Buffer(Buffer &) = delete;
+    Buffer &operator=(Buffer &) = delete;
+
+    Buffer(void *storage, int maxusers)
+        : _maxusers(maxusers), _storage(storage), _numusers(maxusers)
+    {
+        _status = BufferStatus::IDLE;
+    }
+
+    /* Try and populate the given index.
+     * Wait if the buffer is busy with previous index, then:
+     *
+     * If the buffer is idle, grab it and populate it.
+     * If it's already being populated by another thread or is ready, return.
+     */
+    template <typename T>
+    void try_populate(const int index, T func)
+    {
+        for(;;)
+        {
+#ifdef USE_SEMAPHORE
+            /* If it's busy with a previous index, wait on the semaphore. */
+            if((_status == BufferStatus::BUSY) && (_index != index))
+            {
+                std::unique_lock<std::mutex> ul(_lock);
+
+                if((_status == BufferStatus::BUSY) && (_index != index))
+                {
+                    _cv.wait(ul);
+                }
+            }
+#endif
+            /* Return if another thread is populating it already. */
+            if((_index == index) && ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY)))
+            {
+                return;
+            }
+
+            if(_status == BufferStatus::IDLE)
+            {
+                std::lock_guard<std::mutex> guard(_lock);
+
+                /* If the buffer is still idle, we can grab it and populate it. */
+                if(_status == BufferStatus::IDLE)
+                {
+                    _status = BufferStatus::POPULATING;
+                    _index  = index;
+                    _users  = _numusers;
+                    break;
+                }
+            }
+        }
+
+        /* If we get here, fill in the buffer. */
+        populate_buffer(func);
+    }
+
+    template <typename T>
+    void *get(const int index, T func)
+    {
+        // Loop until we achieve something.
+        for(;;)
+        {
+            // If the index is correct and the buffer status is busy then we can
+            // just return the content.  No locking is needed here as the index
+            // cannot change (and status cannot change from BUSY) until all
+            // users have finished.
+            if((_index == index) && (_status == BufferStatus::BUSY))
+            {
+                return _storage;
+            }
+#ifdef USE_SEMAPHORE
+            if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+            {
+                std::unique_lock<std::mutex> ul(_lock);
+
+                if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+                {
+                    _cv.wait(ul);
+                }
+            }
+#endif
+
+            // If it's idle, we need to populate it.  The IDLE->POPULATING
+            // transition requires the lock.
+            if(_status == BufferStatus::IDLE)
+            {
+                std::lock_guard<std::mutex> guard(_lock);
+
+                /* If it's still idle, grab it.  Otherwise drop through and
+                 * we'll do something else next time through the loop.  */
+                if(_status == BufferStatus::IDLE)
+                {
+                    _status = BufferStatus::POPULATING;
+                    _index  = index;
+                    _users  = _numusers;
+                    break;
+                }
+            }
+        }
+
+        /* If we get here we need to populate the buffer. */
+        populate_buffer(func);
+
+        return _storage;
+    }
+
+    /* Threads call this when they have finished processing a buffer.  We
+     * simply (atomically) decrement the user count, and if it's hit zero we
+     * flag the buffer as idle.
+     */
+    void release(void)
+    {
+        if(--_users == 0)
+        {
+#ifdef USE_SEMAPHORE
+            std::unique_lock<std::mutex> ul(_lock);
+            _status = BufferStatus::IDLE;
+            /* We notify all waiters as we expect one to do the populating
+             * and any others to go and process and earlier block.  */
+            _cv.notify_all();
+#else
+            _status = BufferStatus::IDLE;
+#endif
+        }
+    }
+
+    /* This is called to change the number of users. */
+    void set_numusers(int numusers)
+    {
+        _numusers = std::min(numusers, _maxusers);
+    }
+};
+
+class BufferManager
+{
+private:
+    /* This has to be a vector of Buffer *, because a Buffer cannot be moved
+     * or copied due to atomic members. */
+    std::vector<Buffer *> _buffers = {};
+    const int             _maxthreads;
+    void *const           _storage;
+
+public:
+    BufferManager(BufferManager &) = delete;
+    BufferManager &operator=(BufferManager &) = delete;
+
+    // Say how much storage is needed.
+    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+    {
+        return buffersize * ((maxthreads == 1) ? 1 : 3);
+    }
+
+    BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+        : _maxthreads(maxthreads), _storage(storage)
+    {
+        const int numbuffers = (maxthreads == 1) ? 1 : 3;
+
+        /* We don't need any Buffer objects in single thread mode. */
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        /* Use intptr_t to avoid performing arithmetic on a void * */
+        intptr_t storage_int = reinterpret_cast<intptr_t>(_storage);
+
+        for(int i = 0; i < numbuffers; i++)
+        {
+            _buffers.push_back(new Buffer(reinterpret_cast<void *>(storage_int), _maxthreads));
+            storage_int += buffersize;
+        }
+    }
+
+    ~BufferManager()
+    {
+        while(_buffers.size())
+        {
+            delete _buffers.back();
+            _buffers.pop_back();
+        }
+    }
+
+    template <typename T>
+    void *get(const int index, T func)
+    {
+        /* In single thread mode, we just directly call the populating
+         * function on the (single) buffer, otherwise forward to the
+         * relevant Buffer.  */
+        if(_maxthreads == 1)
+        {
+            func(_storage);
+            return _storage;
+        }
+        else
+        {
+            return _buffers[index % _buffers.size()]->get(index, func);
+        }
+    }
+
+    template <typename T>
+    void try_populate(const int index, T func)
+    {
+        /* No need for this in single thread mode. */
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        _buffers[index % _buffers.size()]->try_populate(index, func);
+    }
+
+    void release(const int index)
+    {
+        /* No need for this in single thread mode. */
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        _buffers[index % _buffers.size()]->release();
+    }
+
+    void set_nthreads(int threads)
+    {
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        for(unsigned int i = 0; i < _buffers.size(); i++)
+        {
+            _buffers[i]->set_numusers(threads);
+        }
+    }
+};
+
+#else
+
+/* Trivial implementation if threading is disabled at compile time.
+ *
+ * Here, we only need storage for a single buffer.  The 'get' method needs
+ * to call the supplied function to populate the buffer and then return it.
+ * All the other methods do nothing.
+ */
+
+class BufferManager
+{
+private:
+    void *const _storage;
+
+public:
+    BufferManager(BufferManager &) = delete;
+    BufferManager &operator=(BufferManager &) = delete;
+
+    BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+        : _storage(storage)
+    {
+    }
+
+    ~BufferManager()
+    {
+    }
+
+    // Say how much storage is needed.
+    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+    {
+        return buffersize;
+    }
+
+    template <typename T>
+    void try_populate(const int index, T func)
+    {
+    }
+
+    void release(const int index)
+    {
+    }
+
+    template <typename T>
+    void *get(const int index, T func)
+    {
+        func(_storage);
+        return _storage;
+    }
+
+    void set_nthreads(int)
+    {
+    }
+};
+
+#endif
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemm_batched.hpp
new file mode 100644
index 0000000..385358f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_batched.hpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+template <typename To, typename Tr>
+class GemmBatched : public GemmCommon<To, Tr>
+{
+private:
+    UniqueGemmCommon<To, Tr> _subgemm = nullptr;
+
+public:
+    GemmBatched(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB,
+                const To alpha, const To beta, const int maxthreads, const bool pretransposed_hint)
+    {
+        /* Just create a subgemm with batches->M */
+        _subgemm = gemm<To, Tr>(ci, nbatches, N, K, 1, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint);
+    }
+
+    void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+                    const To *B, const int ldb, const int B_multi_stride,
+                    Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override
+    {
+        /* A and C's batch stride becomes their new row stride.  New batch stride is 0 as nbatches for subgemm is always 1. */
+        _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
+                             B, ldb, B_multi_stride,
+                             C, C_batch_stride, 0, C_multi_stride);
+    }
+
+    unsigned int get_window_size() const override
+    {
+        return _subgemm->get_window_size();
+    }
+
+    void set_nthreads(int nthreads) override
+    {
+        _subgemm->set_nthreads(nthreads);
+    }
+
+    void execute(unsigned int start, unsigned int end, int threadid) override
+    {
+        _subgemm->execute(start, end, threadid);
+    }
+
+    size_t get_working_size() const override
+    {
+        return _subgemm->get_working_size();
+    }
+
+    void set_working_space(void *space) override
+    {
+        _subgemm->set_working_space(space);
+    }
+
+    bool B_is_pretransposed() const override
+    {
+        return _subgemm->B_is_pretransposed();
+    }
+
+    bool B_pretranspose_required() const override
+    {
+        return _subgemm->B_pretranspose_required();
+    }
+
+    size_t get_B_pretransposed_array_size() const override
+    {
+        return _subgemm->get_B_pretransposed_array_size();
+    }
+
+    void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override
+    {
+        _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
+    }
+
+    void set_pretransposed_B_data(void *buffer) override
+    {
+        _subgemm->set_pretransposed_B_data(buffer);
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
new file mode 100644
index 0000000..d1180b1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// This can only be built if the target/compiler supports FP16 arguments.
+#ifdef __ARM_FP16_ARGS
+
+#include "arm_gemm.hpp"
+
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hgemm_24x8.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<__fp16, __fp16> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                      const unsigned int nbatches, const unsigned int nmulti,
+                                      const bool trA, const bool trB, const __fp16 alpha, const __fp16 beta,
+                                      const int maxthreads, const bool pretransposed_hint)
+{
+#ifdef __aarch64__
+
+    // Only consider the native FP16 kernel if it will get built.
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    // If the compiler is configured to enable this feature always, then assume it is available at runtime too.
+    const bool use_fp16 = true;
+#else
+    // Otherwise, detect at runtime via CPUInfo.
+    const bool use_fp16 = ci.has_fp16();
+#endif
+
+    // If FP16 is supported, use it.
+    if(use_fp16)
+    {
+        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+#endif
+
+    // Fallback to using the blocked SGEMM kernel.
+    return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+    // For AArch32, only support the SGEMM route for now.
+    return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class members if necessary.
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+const int hgemm_24x8::out_width;
+const int hgemm_24x8::out_height;
+#endif
+
+} // namespace arm_gemm
+
+#endif // __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
new file mode 100644
index 0000000..43df1aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_gemm.hpp"
+#include "gemm_batched.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
+#include "gemv_native_transposed.hpp"
+#include "gemv_pretransposed.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<float, float> gemm<float, float>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                  const unsigned int nbatches, const unsigned int nmulti,
+                                                  const bool trA, const bool trB, const float alpha, const float beta,
+                                                  const int maxthreads, const bool pretransposed_hint)
+{
+    /* Handle "batched GEMM" */
+    if(M == 1 && nbatches > 1)
+    {
+        return UniqueGemmCommon<float, float>(new GemmBatched<float, float>(ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+#ifdef __aarch64__
+    /* Cases in priority order */
+    /* GemvPretransposed: requires M=1, alpha=1, and transposed hint set.  nbatches must be 1 or we would have returned above so don't test. */
+    if(M == 1 && alpha == 1.0f && pretransposed_hint)
+    {
+        return UniqueGemmCommon<float, float>(new GemvPretransposed<sgemv_pretransposed, float, float>(&ci, N, K, nmulti, trB, beta));
+    }
+
+    /* GemvNativeTransposed: requires M=1, no trA or trB, doesn't handle alpha */
+    if(M == 1 && alpha == 1.0f && !trA && !trB)
+    {
+        return UniqueGemmCommon<float, float>(new GemvNativeTransposed<sgemv_trans, float, float>(&ci, N, K, nmulti, beta));
+    }
+
+    /* Native GEMM: requires M to be a multiple of 4, K at least 4, N a
+     * multiple of 16, doesn't handle alpha and only makes sense for small
+     * sizes.  */
+    if(N <= 128 && K <= 128 && ((M % 4) == 0) && (K >= 4) && ((N % 16) == 0) && alpha == 1.0f)
+    {
+        return UniqueGemmCommon<float, float>(new GemmNative<sgemm_native_16x4, float, float>(&ci, M, N, K, nbatches, nmulti, beta));
+    }
+
+    /* Blocked GEMM, handles all cases. */
+    return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_12x8, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+    return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_8x6, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class variables.
+#ifdef __aarch64__
+const int sgemm_12x8::out_width;
+const int sgemm_12x8::out_height;
+
+const int sgemm_native_16x4::out_width;
+const int sgemm_native_16x4::out_height;
+#else
+const int sgemm_8x6::out_width;
+const int sgemm_8x6::out_height;
+#endif
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
new file mode 100644
index 0000000..7669fe0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+
+namespace arm_gemm
+{
+/* int16 GEMM entry point (aarch64 only).
+ *
+ * Unlike the float/int8 versions there is no dispatch logic here: a single
+ * interleaved 12x8 kernel handles every problem shape, so all parameters are
+ * forwarded straight to GemmInterleaved. */
+template <>
+UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                          const unsigned int nbatches, const unsigned int nmulti,
+                                                          const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+                                                          const int maxthreads, const bool pretransposed_hint)
+{
+    return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_s16_12x8::out_width;
+const int gemm_s16_12x8::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
new file mode 100644
index 0000000..6016af2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+#include "kernels/a64_gemm_s8_12x8.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+
+namespace arm_gemm
+{
+/* int8 GEMM entry point (aarch64 only).
+ *
+ * Two interleaved kernels are available: the wider 12x8 kernel is selected
+ * when the CPU reports dot product support (ci.has_dotprod()); otherwise the
+ * generic 4x4 kernel is used. */
+template <>
+UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                        const unsigned int nbatches, const unsigned int nmulti,
+                                                        const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+                                                        const int maxthreads, const bool pretransposed_hint)
+{
+    if(ci.has_dotprod())
+    {
+        // Dot product supporting CPUs.  This family has a special version for A55r1.
+        return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+
+    /* Fallback for cores without the dot product instructions. */
+    return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_s8_12x8::out_width;
+const int gemm_s8_12x8::out_height;
+const int gemm_s8_4x4::out_width;
+const int gemm_s8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
new file mode 100644
index 0000000..efc5171
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "buffer_manager.hpp"
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm
+{
+/* GemmInterleaved: GemmCommon implementation which copies ("interleaves")
+ * blocks of A and B into a kernel-friendly layout before invoking the
+ * strategy's kernel, then merges the partial results into C.
+ *
+ * Template parameters:
+ *   strategy - kernel descriptor: supplies kernel(), the operand/result
+ *              types, and the blocking/interleave constants used below.
+ *   To       - operand data type of the public interface.
+ *   Tr       - result data type of the public interface.
+ */
+template <typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr>
+{
+    /* Element types as the kernel wants them; Transform/Merge convert
+     * between these and To/Tr if they differ. */
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    /* const properties set by constructor */
+    const CPUInfo *const _ci;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const unsigned int _nbatches;
+    const unsigned int _nmulti;
+
+    const bool _trA;
+    const bool _trB;
+
+    const Tr _alpha;
+    const Tr _beta;
+
+    const unsigned int _maxthreads;
+    const bool         _pretransposed;
+
+    /* Blocking info (computed in the constructor from cache sizes) */
+    unsigned int _k_block = 0;
+    unsigned int _x_block = 0;
+    unsigned int _Mround  = 0; /* _Msize rounded up to a multiple of strategy::out_height */
+
+    /* Working space, pretransposed buffer, buffer manager */
+    const Toi     *_B_transposed  = nullptr;
+    BufferManager *_bm            = nullptr;
+    void          *_working_space = nullptr;
+
+    /* We will need to walk through the blocks of B in a few contexts, so
+     * factor that out.  */
+    /* Iteration order: x0 advances fastest, then k0, then multi.  The same
+     * walk is used by execute_internal(), get_B_pretransposed_array_size()
+     * and pretranspose_B_array(), so the B block layout always agrees. */
+    class blockwalker
+    {
+    private:
+        /* Size loops, etc. based on our parent's configuration */
+        const GemmInterleaved<strategy, To, Tr> &_parent;
+
+        /* K and X and multi parameters for current iteration. */
+        unsigned int _k0 = 0, _x0 = 0, _multi = 0;
+
+        unsigned int _index     = 0;    /* linear block index (key for the BufferManager) */
+        bool         _done      = false;
+        bool         _newkblock = true; /* true on the first block of each K block */
+        bool         _newmulti  = true;
+
+    public:
+        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent)
+            : _parent(parent)
+        {
+        }
+
+        /* Exclusive upper X bound of the current block (clamped to N). */
+        unsigned int xmax()
+        {
+            return std::min(_x0 + _parent._x_block, _parent._Nsize);
+        }
+
+        /* Exclusive upper K bound of the current block (clamped to K). */
+        unsigned int kmax()
+        {
+            return std::min(_k0 + _parent._k_block, _parent._Ksize);
+        }
+
+        /* Advance to the next block, return false at the end. */
+        bool advance(void)
+        {
+            if(_done)
+            {
+                return false;
+            }
+
+            _newkblock = false;
+            _x0 += _parent._x_block;
+            if(_x0 >= _parent._Nsize)
+            {
+                _x0 = 0;
+                _k0 += _parent._k_block;
+                if(_k0 >= _parent._Ksize)
+                {
+                    _k0 = 0;
+                    _multi++;
+                    if(_multi >= _parent._nmulti)
+                    {
+                        _done = true;
+                        return false;
+                    }
+                    _newmulti = true;
+                }
+                _newkblock = true;
+            }
+            _index++;
+
+            return true;
+        }
+
+        unsigned int k0(void)
+        {
+            return _k0;
+        }
+        unsigned int x0(void)
+        {
+            return _x0;
+        }
+        unsigned int multi(void)
+        {
+            return _multi;
+        }
+        unsigned int index(void)
+        {
+            return _index;
+        }
+        bool done(void)
+        {
+            return _done;
+        }
+        bool newkblock(void)
+        {
+            return _newkblock;
+        }
+    };
+
+    // A working size: One of these needed, regardless of thread count.  Divided according to window.
+    size_t get_a_working_size() const
+    {
+        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
+    }
+
+    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
+    size_t get_b_working_size() const
+    {
+        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
+    }
+
+    // C working size: One needed per thread.
+    size_t get_c_working_size() const
+    {
+        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height);
+    }
+
+    // Internal execute function.
+    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
+    // [start, end) is this thread's slice of the window (see get_window_size()),
+    // measured in units of out_height rows, spanning batches.
+    template <bool pretransposed>
+    void execute_internal(unsigned int start, unsigned int end, int threadid)
+    {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+
+        strategy strat(_ci);
+
+        blockwalker current(*this);
+        blockwalker next = current;
+
+        /* Translate 'start' and 'end' into a position within the batches and rows. */
+        const unsigned int window_per_batch = _Mround / strategy::out_height;
+        unsigned int       batch_0          = start / window_per_batch;
+        unsigned int       batch_end        = end / window_per_batch;
+
+        /* Compute the M values to operate on */
+        unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height;
+        unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height;
+
+        /* Make sure we've been set up correctly. */
+        if(pretransposed)
+        {
+            assert(_B_transposed);
+        }
+        else
+        {
+            assert(_bm);
+        }
+
+        assert(_working_space);
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+        // Private buffers.  Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer.
+        // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
+        Toi *const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
+        Tri *const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+
+        // Shared buffers - these come either from BufferManager or _B_transposed.
+        // (In the non-pretransposed case this is re-fetched from the
+        // BufferManager on every block iteration below.)
+        const Toi *b_panel;
+
+        if(pretransposed)
+        {
+            b_panel = _B_transposed;
+        }
+
+        //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
+
+        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+        int kern_k = 0;
+
+        for(; !current.done(); current.advance())
+        {
+            if(current.newkblock())
+            {
+#ifdef CYCLE_PROFILING
+                auto p = prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height * (current.kmax() - current.k0()) * sizeof(Toi));
+#endif
+                // Re-interleave this thread's slice of A for the new K block.
+                for(unsigned int batch = batch_0; batch <= batch_end; batch++)
+                {
+                    unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                    if(first_m >= last_m)
+                        continue;
+                    if(_trA ^ strategy::A_transpose)
+                    {
+                        Transform<strategy::A_interleave, strategy::A_block, true>(
+                            a_panel + ((batch * _Mround + first_m) * _k_block),
+                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+                            this->_lda, first_m, last_m, current.k0(), current.kmax());
+                    }
+                    else
+                    {
+                        Transform<strategy::A_interleave, strategy::A_block, false>(
+                            a_panel + ((batch * _Mround + first_m) * _k_block),
+                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+                            this->_lda, first_m, last_m, current.k0(), current.kmax());
+                    }
+                }
+
+                // Figure out how many "K" the kernel will actually process.
+                // (The K extent rounded up to the kernel's unroll factor.)
+                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll);
+                kern_k *= strat.k_unroll;
+            }
+
+            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width);
+
+            if(!pretransposed)
+            {
+                /* Look ahead to the next block and populate it if necessary.
+                 * This avoids the populate operation becoming a bottleneck, and
+                 * helps keep the threads synchronized (the first thread to get
+                 * here will populate while the rest will advance).
+                 *
+                 * If we are running single threaded, bm->try_populate() will do
+                 * nothing.
+                 */
+                if(next.advance())
+                {
+                    _bm->try_populate(next.index(), [&](void *buffer)
+                    {
+#ifdef CYCLE_PROFILING
+                        auto p = prof.ScopedProfiler(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi));
+#endif
+
+                        Toi *b_panel = reinterpret_cast<Toi *>(buffer);
+                        if(_trB ^ strategy::B_transpose)
+                        {
+                            Transform<strategy::B_interleave, strategy::B_block, true>(
+                                b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
+                                next.x0(), next.xmax(), next.k0(), next.kmax());
+                        }
+                        else
+                        {
+                            Transform<strategy::B_interleave, strategy::B_block, false>(
+                                b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
+                                next.x0(), next.xmax(), next.k0(), next.kmax());
+                        }
+                    });
+                }
+                /* Get the buffer for this iteration from the BufferManager. */
+                /* The lambda populates the buffer if no other thread has done so yet. */
+                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv)
+                {
+#ifdef CYCLE_PROFILING
+                    auto p = prof.ScopedProfiler(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi));
+#endif
+
+                    Toi *b_panel = reinterpret_cast<Toi *>(bpv);
+                    if(_trB ^ strategy::B_transpose)
+                    {
+                        Transform<strategy::B_interleave, strategy::B_block, true>(
+                            b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
+                            current.x0(), current.xmax(), current.k0(), current.kmax());
+                    }
+                    else
+                    {
+                        Transform<strategy::B_interleave, strategy::B_block, false>(
+                            b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
+                            current.x0(), current.xmax(), current.k0(), current.kmax());
+                    }
+
+                }));
+            }
+
+            /* Do the actual work. */
+            for(unsigned int batch = batch_0; batch <= batch_end; batch++)
+            {
+                unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+
+                if(first_m >= last_m)
+                    continue;
+
+                for(unsigned int y = first_m; y < last_m; y += strategy::out_height)
+                {
+                    unsigned int ymax = std::min(_Msize, y + strategy::out_height);
+
+                    {
+#ifdef CYCLE_PROFILING
+                        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k));
+#endif
+
+                        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+
+                        a_ptr += (strategy::out_height * kern_k);
+                    }
+
+                    {
+#ifdef CYCLE_PROFILING
+                        auto p = prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)));
+#endif
+                        // beta is only applied on the first K block; later K
+                        // blocks accumulate (beta of 1) into the partial C.
+                        MergeResults<strategy::out_width, strategy::out_height>(
+                            this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
+                            c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
+                            _alpha, (current.k0() == 0 ? _beta : static_cast<Tr>(1)));
+                    }
+                }
+            }
+
+            if(pretransposed)
+            {
+                // Step to the next B block; sizes match pretranspose_B_array().
+                b_panel += (bblocks * strat.out_width * kern_k);
+            }
+            else
+            {
+                _bm->release(current.index());
+            }
+        }
+    }
+
+public:
+    GemmInterleaved(GemmInterleaved &) = delete;
+    GemmInterleaved &operator=(GemmInterleaved &) = delete;
+
+    /* Constructor */
+    /* Computes the K and X (N-direction) blocking from the L1/L2 cache
+     * sizes, then evens the blocks out over the presented problem size. */
+    GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                    const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB,
+                    const Tr alpha, const Tr beta, const int maxthreads, const bool pretransposed)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed)
+    {
+        const unsigned int L1_size = ci->get_L1_cache_size();
+        const unsigned int L2_size = ci->get_L2_cache_size();
+
+        assert(maxthreads > 0);
+
+        // Work out blocking parameters
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width, strategy::out_height)));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        _k_block /= strategy::k_unroll;
+        _k_block = std::max(_k_block, 1U) * strategy::k_unroll;
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        int num_k_blocks = iceildiv(K, _k_block);
+
+        // So divide the space equally into that many blocks.
+        _k_block = iceildiv(K, num_k_blocks);
+
+        // And round UP to the K unroll level required.
+        _k_block = iceildiv(_k_block, strategy::k_unroll);
+        _k_block *= strategy::k_unroll;
+
+        // x_block: Work out how many rows (of length k_block) will fit in the L2
+        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+        _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block);
+
+        // Needs to be (at least a single) multiple of the kernel output width.
+        _x_block /= strategy::out_width;
+        _x_block = std::max(_x_block, 1U) * strategy::out_width;
+
+        // And tune to the presented problem size.
+        int num_x_blocks = iceildiv(N, _x_block);
+        _x_block         = iceildiv(N, num_x_blocks);
+
+        _x_block = iceildiv(_x_block, strategy::out_width);
+        _x_block *= strategy::out_width;
+
+        // Work out the rounded size of M - needed for some buffers.
+        _Mround = iceildiv(M, strategy::out_height);
+        _Mround *= strategy::out_height;
+    }
+
+    // Interface implementation - Compulsory functions
+
+    // Window size: Only the last thread should do a ragged block, so dole
+    // out work in units of out_height.  Factor batches into the window, but
+    // not multi for now (as this would cause problems with the buffer
+    // manager).
+
+    unsigned int get_window_size() const override
+    {
+        // _Mround is a multiple of out_height by definition.
+        return (_Mround / strategy::out_height) * _nbatches;
+    }
+
+    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
+    void set_nthreads(int nthreads) override
+    {
+        if(_bm)
+        {
+            _bm->set_nthreads(nthreads);
+        }
+    }
+
+    // Execute
+    // Dispatch to the right execute_internal instantiation for the
+    // configured B handling mode.
+    void execute(unsigned int start, unsigned int end, int threadid) override
+    {
+        if(_pretransposed)
+        {
+            execute_internal<true>(start, end, threadid);
+        }
+        else
+        {
+            execute_internal<false>(start, end, threadid);
+        }
+    }
+
+    // Interface implementation - working space
+    size_t get_working_size() const override
+    {
+        // In all cases, we need one A buffer plus a C buffer per thread.
+        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
+
+        // For pretransposed case, there is no working space needed for B.
+        // Otherwise, we need a BufferManager.
+        if(!_pretransposed)
+        {
+            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+        }
+
+        size += 64; // Add on a cache line extra for alignment.
+
+        return size;
+    }
+
+    // Takes (but does not own) the caller-allocated working space.
+    void set_working_space(void *working_space) override
+    {
+        // Make sure everything ends up cache line aligned
+        // (get_working_size() reserved an extra cache line for this shift).
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+        intptr_t working_space_int   = reinterpret_cast<intptr_t>(working_space);
+
+        size_t diff = 0;
+
+        if(working_space_int & 0x3F)
+        {
+            diff = 0x40 - (working_space_int & 0x3F);
+        }
+
+        working_space_bytes += diff;
+
+        if(_pretransposed)
+        {
+            // Pretransposed case: just set internal pointer to parameter value.
+            _working_space = reinterpret_cast<void *>(working_space_bytes);
+        }
+        else
+        {
+            // Otherwise, use the first part of the working space for the buffer manager.
+            // It's legal to call this again so don't leak a buffer manager if it already existed.
+            delete _bm;
+
+            _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
+
+            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+
+            _working_space = reinterpret_cast<void *>(working_space_bytes);
+        }
+    }
+
+    // Interface implementation - pretransposed
+    bool B_is_pretransposed() const override
+    {
+        return _pretransposed;
+    }
+
+    bool B_pretranspose_required() const override
+    {
+        return _pretransposed && (_B_transposed == nullptr);
+    }
+
+    // Total bytes needed for the pretransposed B, summed over every block
+    // the blockwalker will visit (sizes rounded up as the kernel requires).
+    size_t get_B_pretransposed_array_size() const override
+    {
+        size_t      total = 0;
+        blockwalker current(*this);
+
+        do
+        {
+            /* Figure out the size of each block. */
+            size_t x_size = (current.xmax() - current.x0());
+            size_t k_size = (current.kmax() - current.k0());
+
+            /* Round sizes up as needed. */
+            x_size = iceildiv(x_size, strategy::out_width);
+            x_size *= strategy::out_width;
+
+            k_size = iceildiv(k_size, strategy::k_unroll);
+            k_size *= strategy::k_unroll;
+
+            total += x_size * k_size * sizeof(Toi);
+        }
+        while(current.advance());
+
+        return total;
+    }
+
+    // Fill in_buffer (caller-owned, sized per the function above) with the
+    // interleaved form of B; block order matches execute_internal<true>.
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override
+    {
+        blockwalker current(*this);
+        Toi        *buffer = reinterpret_cast<Toi *>(in_buffer);
+        _B_transposed      = buffer;
+
+        do
+        {
+            /* Figure out the size of each block. */
+            size_t x_size = (current.xmax() - current.x0());
+            size_t k_size = (current.kmax() - current.k0());
+
+            /* Round sizes up as needed. */
+            x_size = iceildiv(x_size, strategy::out_width);
+            x_size *= strategy::out_width;
+
+            k_size = iceildiv(k_size, strategy::k_unroll);
+            k_size *= strategy::k_unroll;
+
+            if(_trB ^ strategy::B_transpose)
+            {
+                Transform<strategy::B_interleave, strategy::B_block, true>(
+                    buffer, B + (current.multi() * B_multi_stride), ldb,
+                    current.x0(), current.xmax(), current.k0(), current.kmax());
+            }
+            else
+            {
+                Transform<strategy::B_interleave, strategy::B_block, false>(
+                    buffer, B + (current.multi() * B_multi_stride), ldb,
+                    current.x0(), current.xmax(), current.k0(), current.kmax());
+            }
+
+            buffer += (x_size * k_size);
+        }
+        while(current.advance());
+    }
+
+    void set_pretransposed_B_data(void *in_buffer) override
+    {
+        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+    }
+
+    // The working space and pretransposed B buffer are caller-owned; only
+    // the BufferManager is allocated (and therefore freed) by this class.
+    ~GemmInterleaved() override
+    {
+        delete _bm;
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
new file mode 100644
index 0000000..075ab82
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This is implementation is for native GEMM with no transposition.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
+
+/* GemmNative: GemmCommon implementation which calls the kernel directly on
+ * the source matrices with no transposition/interleaving and no working
+ * space.  Alpha is not handled here (callers must only select this path for
+ * alpha == 1); beta is passed straight through to the kernel. */
+template <typename strategy, typename To, typename Tr>
+class GemmNative : public GemmCommon<To, Tr>
+{
+    /* Kernel-side element types; the static_asserts in execute() require
+     * these to match To/Tr since the data is used in place. */
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const unsigned int _nbatches;
+    const unsigned int _nmultis;
+
+    Tr _beta;
+
+    const CPUInfo *const _ci;
+
+    /* Blocking is currently disabled: whole K and N extents per call. */
+    unsigned int k_block = 0;
+    unsigned int n_block = 0;
+
+public:
+    GemmNative(GemmNative &) = delete;
+    GemmNative &operator=(GemmNative &) = delete;
+
+    GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta)
+        : _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci)
+    {
+        /* For now don't do any blocking.*/
+        k_block = K;
+        n_block = N;
+    }
+
+    // Window is number of out_height blocks
+    // (spanning batches and multis, unlike GemmInterleaved).
+    unsigned int get_window_size() const override
+    {
+        return iceildiv(_Msize, strategy::out_height) * _nbatches * _nmultis;
+    }
+
+    // Actually execute the GEMM.
+    // [start, end) is a slice of the window; decompose it into
+    // (multi, batch, row) coordinates and walk the covered range.
+    void execute(unsigned int start, unsigned int end, int) override
+    {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+        strategy           strat(_ci);
+        const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height);
+        const unsigned int window_per_multi = window_per_batch * _nbatches;
+
+        const unsigned int first_multi = start / window_per_multi;
+        const unsigned int last_multi  = end / window_per_multi;
+
+        const unsigned int first_batch = (start - (first_multi * window_per_multi)) / window_per_batch;
+        const unsigned int last_batch  = (end - (last_multi * window_per_multi)) / window_per_batch;
+
+        // NOTE(review): when 'end' falls exactly on a multi/batch boundary the
+        // last_* values wrap to the next unit with a zero row extent; the
+        // m_start/m_end guards below make that iteration a no-op - confirm.
+        const unsigned int first_row = ((start - (first_multi * window_per_multi)) % window_per_batch) * strategy::out_height;
+        const unsigned int last_row  = ((end - (last_multi * window_per_multi)) % window_per_batch) * strategy::out_height;
+
+        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+        for(unsigned int multi = first_multi; multi <= last_multi; multi++)
+        {
+            const unsigned int batch_0   = (multi == first_multi) ? first_batch : 0;
+            const unsigned int batch_max = (multi == last_multi) ? last_batch : _nbatches - 1;
+
+            for(unsigned int batch = batch_0; batch <= batch_max; batch++)
+            {
+                const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0;
+                const unsigned int m_end   = ((multi == last_multi) && (batch == last_batch)) ? last_row : _Msize;
+
+                for(unsigned int y0 = m_start; y0 < m_end; y0 += strategy::out_height)
+                {
+                    const unsigned int ymax = std::min(y0 + strategy::out_height, m_end);
+#ifdef CYCLE_PROFILING
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax - y0) * _Nsize * _Ksize);
+#endif
+
+                    strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
+                                 this->_Bptr + (multi * this->_B_multi_stride), this->_ldb,
+                                 this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc,
+                                 _beta, (ymax - y0), _Nsize, _Ksize);
+                }
+            }
+        }
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
new file mode 100644
index 0000000..8f1f377
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u16_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                              const unsigned int nbatches, const unsigned int nmulti,
+                                                              const bool trA, const bool trB, uint32_t alpha, uint32_t beta,
+                                                              const int maxthreads, const bool pretransposed_hint)
+{
+    return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_u16_12x8::out_width;
+const int gemm_u16_12x8::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
new file mode 100644
index 0000000..12e5aa6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u8_12x8.hpp"
+#include "kernels/a64_gemm_u8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                            const unsigned int nbatches, const unsigned int nmulti,
+                                                            const bool trA, const bool trB, const uint32_t alpha, const uint32_t beta,
+                                                            const int maxthreads, const bool pretransposed_hint)
+{
+    if(ci.has_dotprod())
+    {
+        // Dot product supporting CPUs.  This family has a special version for A55r1.
+        return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+
+    // Non dot-product code.
+    return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_u8_12x8::out_width;
+const int gemm_u8_12x8::out_height;
+
+const int gemm_u8_4x4::out_width;
+const int gemm_u8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
new file mode 100644
index 0000000..63bb58a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for a "native" (no-transform) GEMV with a
+// transposed matrix.
+//
+// As a native operation the source data is used in-place, so the internal
+// and external operand/result types must match.
+template <typename strategy, typename To, typename Tr>
+class GemvNativeTransposed : public GemmCommon<To, Tr>
+{
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+    const unsigned int _nmultis;
+
+    const Tr _beta;
+
+    const CPUInfo *const _ci;
+
+    unsigned int m_block = 0;
+    unsigned int n_block = 0;
+
+public:
+    GemvNativeTransposed(GemvNativeTransposed &) = delete;
+    GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete;
+
+    GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta)
+        : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci)
+    {
+        /* For now don't do any blocking.*/
+        m_block = K;
+        n_block = N;
+    }
+
+    // Window is number of out_width blocks times number of multis.
+    unsigned int get_window_size() const override
+    {
+        return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+    }
+
+    // Actually execute the GEMV.
+    void execute(unsigned int start, unsigned int end, int) override
+    {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+
+        strategy strat(_ci);
+
+        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+        const unsigned int multi_0          = start / window_per_multi;
+        const unsigned int multi_end        = end / window_per_multi;
+
+        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width;
+        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+
+        static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
+
+        for(unsigned int multi = multi_0; multi <= multi_end; multi++)
+        {
+            const unsigned int n_start = (multi == multi_0) ? n_0 : 0;
+            const unsigned int n_end   = (multi == multi_end) ? n_max : _Nsize;
+
+            if(n_end <= n_start)
+                continue;
+
+            for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+            {
+                unsigned int mmax = std::min(m0 + m_block, _Ksize);
+                for(unsigned int n0 = n_start; n0 < n_end; n0 += n_block)
+                {
+                    unsigned int nmax = std::min(n0 + n_block, n_end);
+#ifdef CYCLE_PROFILING
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n0));
+#endif
+                    strat.kernel(this->_Bptr + (multi * this->_B_multi_stride) + (m0 * this->_ldb) + n0,
+                                 this->_Aptr + (multi * this->_A_multi_stride) + m0,
+                                 this->_Cptr + (multi * this->_C_multi_stride) + n0,
+                                 _beta, this->_ldb, (mmax - m0), (nmax - n0));
+                }
+            }
+        }
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
new file mode 100644
index 0000000..79f1359
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for GEMV with pretransposition.
+// batches are not supported as a batched GEMV makes no sense (can be converted to a GEMM).
+
+template <typename strategy, typename To, typename Tr>
+class GemvPretransposed : public GemmCommon<To, Tr>
+{
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+    const unsigned int _nmultis;
+
+    const bool _trB;
+
+    const Tr _beta;
+
+    const CPUInfo *const _ci;
+    const unsigned int   _buffer_per_multi;
+
+    unsigned int m_block = 0;
+    unsigned int n_block = 0;
+
+    const Toi *_A_pretransposed = nullptr;
+
+public:
+    GemvPretransposed(GemvPretransposed &) = delete;
+    GemvPretransposed &operator=(GemvPretransposed &) = delete;
+
+    GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta)
+        : _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci), _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave)
+    {
+        /* For now don't do any blocking.*/
+        m_block = K;
+        n_block = N;
+    }
+
+    // Window is number of out_width blocks, times number of multis.
+    unsigned int get_window_size() const override
+    {
+        return iceildiv(_Nsize, strategy::out_width) * _nmultis;
+    }
+
+    // Actually execute the GEMV.
+    void execute(unsigned int start, unsigned int end, int) override
+    {
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+
+        strategy strat(_ci);
+
+        /* Break the window values down into multis of interest... */
+        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
+        const unsigned int multi_0          = start / window_per_multi;
+        const unsigned int multi_end        = end / window_per_multi;
+
+        /* ... and figure out where we start and end in the first and last multi. */
+        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width;
+        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
+
+        static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
+
+        for(unsigned int multi = multi_0; multi <= multi_end; multi++)
+        {
+            const unsigned int n_start = (multi == multi_0) ? n_0 : 0;
+            const unsigned int n_end   = (multi == multi_end) ? n_max : _Nsize;
+
+            if(n_end <= n_start)
+                continue;
+
+            for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+            {
+                unsigned int mmax = std::min(m0 + m_block, _Ksize);
+                for(unsigned int n = n_start; n < n_end; n += n_block)
+                {
+                    unsigned int nmax = std::min(n + n_block, n_end);
+#ifdef CYCLE_PROFILING
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n));
+#endif
+                    /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
+                    strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave),
+                                 (_Ksize * strategy::A_interleave),
+                                 this->_Aptr + (multi * this->_A_multi_stride) + m0,
+                                 this->_Cptr + (multi * this->_C_multi_stride) + n,
+                                 _beta, (mmax - m0), (nmax - n));
+                }
+            }
+        }
+    }
+
+    /* Pretransposed interface implementation */
+    bool B_is_pretransposed() const override
+    {
+        return true;
+    }
+
+    bool B_pretranspose_required() const override
+    {
+        /* Transpose is required if _A_pretransposed is still nullptr */
+        return (_A_pretransposed == nullptr);
+    }
+
+    size_t get_B_pretransposed_array_size() const override
+    {
+        return _buffer_per_multi * _nmultis * sizeof(To);
+    }
+
+    void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override
+    {
+        Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
+
+        for(unsigned int multi = 0; multi < _nmultis; multi++)
+        {
+            /* Reverse sense here as we are dealing with B rather than A.  So if
+             * strategy::A_transpose is false and _trB is false, we still
+             * transpose.  */
+            if(_trB ^ strategy::A_transpose)
+            {
+                Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+            }
+            else
+            {
+                Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
+            }
+        }
+
+        _A_pretransposed = A_buffer;
+    }
+
+    void set_pretransposed_B_data(void *buffer) override
+    {
+        _A_pretransposed = reinterpret_cast<Toi *>(buffer);
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
new file mode 100644
index 0000000..de11dc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a32_sgemm_8x6(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a53(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a55r1(const float *, const float *, float *, int, int, int);
+
+// 8x6 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPU_type
+// structure.
+class sgemm_8x6
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 6;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 8;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 8;
+    static const int out_height = 6;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a32_sgemm_8x6;
+
+    sgemm_8x6(const CPUInfo *ci)
+    {
+        switch(ci->get_cpu_model())
+        {
+            case CPUModel::A53:
+                kernel = a32_sgemm_8x6_a53;
+                break;
+
+            case CPUModel::A55r1:
+                kernel = a32_sgemm_8x6_a55r1;
+                break;
+
+            default:
+                kernel = a32_sgemm_8x6;
+                break;
+        }
+    }
+};
+
+} // namespace arm_gemm
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
new file mode 100644
index 0000000..428498f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr     = a_ptr0;
+            int tails = (K & 3);
+            if(tails == 0)
+            {
+                tails = 4;
+            }
+            int k = ((K + 3) / 4) - 1;
+
+            __asm __volatile(
+                "vmov.i32    q4, #0\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]\n"
+                "vmov.i32    q5, #0\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]\n"
+                "vmov.i32    q6, #0\n"
+                "ldr        r0, [%[a_ptr], #0x10]\n"
+                "vmov.i32    q7, #0\n"
+                "ldr        r1, [%[a_ptr], #0x14]\n"
+                "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32    q9, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32    q10, #0\n" ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32    q11, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x80]")
+                "vmov.i32    q12, #0\n"
+                "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32    q14, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]")
+                "vmov.i32    q15, #0\n"
+                "cmp        %[k], #0\n"
+                "beq        6f\n"
+
+                "1:\n"
+                // Unroll 0
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmov        d2, r0, r1\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "ldr        r0, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "ldr        r1, [%[b_ptr], #0x1C]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+
+                "vldr        d3, [%[a_ptr], #0x18]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x100]")
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "ldr        r0, [%[b_ptr], #0x28]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "ldr        r1, [%[b_ptr], #0x2C]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+
+                "vldr        d0, [%[a_ptr], #0x20]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "ldr        r0, [%[a_ptr], #0x28]\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "ldr        r1, [%[a_ptr], #0x2C]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+
+                // Unroll 1
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmov        d1, r0, r1\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "ldr        r0, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "ldr        r1, [%[b_ptr], #0x3C]\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+
+                "vldr        d2, [%[a_ptr], #0x30]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x100]")
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "ldr        r0, [%[b_ptr], #0x48]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "ldr        r1, [%[b_ptr], #0x4C]\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+
+                "vldr        d3, [%[a_ptr], #0x38]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "ldr        r0, [%[a_ptr], #0x40]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "ldr        r1, [%[a_ptr], #0x44]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+
+                // Unroll 2
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmov        d0, r0, r1\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "ldr        r0, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "ldr        r1, [%[b_ptr], #0x5C]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+
+                "vldr        d1, [%[a_ptr], #0x48]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "ldr        r0, [%[b_ptr], #0x68]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "ldr        r1, [%[b_ptr], #0x6C]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+
+                "vldr        d2, [%[a_ptr], #0x50]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "ldr        r0, [%[a_ptr], #0x58]\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "ldr        r1, [%[a_ptr], #0x5C]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "add        %[a_ptr], %[a_ptr], #0x60\n"
+
+                // Unroll 3
+                "vldr        d6, [%[b_ptr], #0x70]\n"
+                "vmov        d3, r0, r1\n"
+                "vmla.f32    q4, q2, d1[0]\n"
+                "ldr        r0, [%[b_ptr], #0x78]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "ldr        r1, [%[b_ptr], #0x7C]\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "add        %[b_ptr], %[b_ptr], #0x80\n"
+
+                "vldr        d0, [%[a_ptr], #0x00]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0xC0]")
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x00]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "ldr        r0, [%[b_ptr], #0x08]\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "ldr        r1, [%[b_ptr], #0x0C]\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "subs        %[k], %[k], #1\n"
+
+                "vldr        d1, [%[a_ptr], #0x08]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "ldr        r0, [%[a_ptr], #0x10]\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "ldr        r1, [%[a_ptr], #0x14]\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "bne        1b\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, must be 1-4 inclusive.  Bail out to alternative tail
+                // immediately if it's 1.
+                "6:\n"
+                "subs        %[tails], %[tails], #1\n"
+                "beq        3f\n"
+
+                // Detached final iteration - for now adapt the generic
+                // tails rather than reimplementing for A53.
+
+                // Unroll 0
+                "vmov        d2, r0, r1\n"
+                "add        %[a_ptr], %[a_ptr], #0x18\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "add        %[b_ptr], %[b_ptr], #0x10\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        4f\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        5f\n"
+
+                // Unroll 2
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==1 final tail
+                "3:\n"
+                "vmov        d2, r0, r1\n"
+                "add        %[b_ptr], %[b_ptr], #0x10\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "add        %[a_ptr], %[a_ptr], #0x18\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==2 final tail
+                "4:\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==3 final tail
+                "5:\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vld1.32    {d0}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+
+                "2:\n"
+                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
new file mode 100644
index 0000000..4cfb72a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    /* Work out starting values for "k" (number of full 4-deep unrolled blocks) and "tails" (1-4 leftover multiplies) for the inner loop. */
+    int tails_initial = (K & 3);
+    if(tails_initial == 0)
+    {
+        tails_initial = 4;
+    }
+
+    int k_initial = ((K + 3) / 4) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            int tails = tails_initial;
+            int k     = k_initial;
+
+            a_ptr = a_ptr0;
+
+            __asm __volatile(
+                "vldr        d0, [%[a_ptr]]\n"
+                "vmov.i32    q4, #0\n"
+                "vldr        d1, [%[a_ptr], #0x08]\n"
+                "vmov.i32    q5, #0\n"
+                "vldr        d4, [%[b_ptr]]\n"
+                "vmov.i32    q6, #0\n"
+                "vldr        d5, [%[b_ptr], #0x08]\n"
+                "vmov.i32    q7, #0\n"
+                "vldr        d2, [%[a_ptr], #0x10]\n"
+                "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32    q9, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32    q10, #0\n" ASM_PREFETCH("[%[b_ptr], #0x80]") "vmov.i32    q11, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32    q12, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]") "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32    q14, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x100]") "vmov.i32    q15, #0\n" ASM_PREFETCH("[%[a_ptr], #0x100]") "cmp        %[k], #0\n" ASM_PREFETCH("[%[b_ptr], #0x140]") "beq        6f\n"
+                ASM_PREFETCH("[%[b_ptr], #0x180]")
+
+                "1:\n"
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vldr        d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vldr        d3, [%[a_ptr], #0x18]\n"
+                "vmla.f32    q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+                "vmla.f32    q8, q2, d2[0]\n"
+                "subs        %[k], %[k], #1\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vldr        d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vldr        d5, [%[b_ptr], #0x28]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x20]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vldr        d1, [%[a_ptr], #0x28]\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vldr        d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vldr        d2, [%[a_ptr], #0x30]\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+
+                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+                "vmla.f32    q8, q2, d1[0]\n"
+
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vldr        d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vldr        d5, [%[b_ptr], #0x48]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vldr        d3, [%[a_ptr], #0x38]\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x40]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+
+                // Unroll 2
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vldr        d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vldr        d1, [%[a_ptr], #0x48]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+
+                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x180]")
+                "vmla.f32    q8, q2, d0[0]\n"
+
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vldr        d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vldr        d5, [%[b_ptr], #0x68]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vldr        d2, [%[a_ptr], #0x50]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vldr        d3, [%[a_ptr], #0x58]\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "add        %[a_ptr], %[a_ptr], #0x60\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vldr        d6, [%[b_ptr], #0x70]\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vldr        d7, [%[b_ptr], #0x78]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "add        %[b_ptr], %[b_ptr], #0x80\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vldr        d0, [%[a_ptr], #0x00]\n"
+                "vmla.f32    q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0x180]")
+                "vmla.f32    q8, q2, d3[0]\n"
+
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vldr        d4, [%[b_ptr], #0x00]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vldr        d5, [%[b_ptr], #0x08]\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vldr        d1, [%[a_ptr], #0x08]\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vldr        d2, [%[a_ptr], #0x10]\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+
+                "vmla.f32    q15, q3, d3[1]\n"
+                "bne        1b\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, must be 1-4 inclusive.  Bail out to alternative tail
+                // immediately if it's 1.
+                "6:\n"
+                "subs        %[tails], %[tails], #1\n"
+                "beq        3f\n"
+
+                // Detached final iteration - last 2-4 multiply blocks, with output stores interleaved into the final block.
+
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vldr        d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vldr        d3, [%[a_ptr], #0x18]\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vldr        d4, [%[b_ptr], #0x20]\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vldr        d5, [%[b_ptr], #0x28]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x20]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "add        %[b_ptr], %[b_ptr], #0x30\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vldr        d1, [%[a_ptr], #0x28]\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "beq        4f\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vldr        d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vldr        d2, [%[a_ptr], #0x30]\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+
+                "vmla.f32    q9, q2, d1[1]\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vldr        d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vldr        d5, [%[b_ptr], #0x48]\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vldr        d3, [%[a_ptr], #0x38]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x40]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "beq        5f\n"
+
+                // Unroll 2
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vldr        d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vldr        d1, [%[a_ptr], #0x48]\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vldr        d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vldr        d5, [%[b_ptr], #0x68]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vldr        d2, [%[a_ptr], #0x50]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vldr        d3, [%[a_ptr], #0x58]\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vldr        d6, [%[b_ptr], #0x70]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vldr        d7, [%[b_ptr], #0x78]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x60\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x80\n"
+                "b        2f\n"
+
+                // tails==1 final tail
+                "3:\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vldr        d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x18\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x20\n"
+                "b        2f\n"
+
+                // tails==2 final tail
+                "4:\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vldr        d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x40\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x30\n"
+                "b        2f\n"
+
+                // tails==3 final tail
+                "5:\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vldr        d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x48\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x60\n"
+
+                "2:\n"
+                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1", "cc", "memory"); // "memory": asm reads A/B panels and stores to C through the pointer operands; "cc": subs/cmp set the flags
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
new file mode 100644
index 0000000..d7d0484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr     = a_ptr0;
+            int tails = (K & 3);
+            if(tails == 0)
+            {
+                tails = 4;
+            }
+            int k = ((K + 3) / 4) - 1;
+
+            __asm __volatile(
+                "vmov.i32    q4, #0\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmov.i32    q5, #0\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmov.i32    q6, #0\n" ASM_PREFETCH("[%[a_ptr], #48]") "vmov.i32    q7, #0\n" ASM_PREFETCH("[%[b_ptr], #48]") "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[a_ptr], #112]") "vmov.i32    q9, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #112]")
+                "vmov.i32    q10, #0\n"
+                "vmov.i32    q11, #0\n"
+                "vmov.i32    q12, #0\n"
+                "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #176]") "vmov.i32    q14, #0\n" ASM_PREFETCH("[%[b_ptr], #176]")
+                "vmov.i32    q15, #0\n"
+
+                "cmp        %[k], #0\n"
+                "beq        6f\n"
+
+                "1:\n"
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "subs        %[k], %[k], #1\n"
+                "vmla.f32    q5, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #208]")
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+
+                // Unroll 2
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #240]")
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vmla.f32    q11, q3, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #208]")
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "bne        1b\n"
+
+                // Branch here if we never execute main loop.
+                "6:\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, must be 1-4 inclusive.  Bail out to alternative tail
+                // immediately if it's 1.
+                "subs        %[tails], %[tails], #1\n"
+                "beq        3f\n"
+
+                // Detached final iteration
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        4f\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        5f\n"
+
+                // Unroll 2
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==1 final tail
+                "3:\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d2}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==2 final tail
+                "4:\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==3 final tail
+                "5:\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vld1.32    {d0}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+
+                "2:\n"
+                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
new file mode 100644
index 0000000..387f899
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+// 12x8 GEMM "strategy" class for int16 operands.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class gemm_s16_12x8
+{
+public:
+    typedef int16_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 8;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 12;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a64_gemm_s16_asimd_12x8;
+
+    gemm_s16_12x8(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
new file mode 100644
index 0000000..b217dcf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const int16_t *a_ptr = Apanel;
+    int32_t       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int16_t *a_ptr0 = a_ptr;
+        const int16_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr            = a_ptr0;
+            const bool odd_k = K & 0x1;
+            int        k     = (K + 1) / 2 - 1;
+
+            register int16x8_t aa asm("v0");
+            register int16x8_t ab asm("v1");
+            register int16x8_t b0 asm("v2");
+            register int16x8_t b1 asm("v3");
+            register int16x8_t b2 asm("v4");
+
+            __asm __volatile(
+                "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+                "movi v5.4s, #0\n"
+                "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+                "movi v6.4s, #0\n"
+                "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+                "ins %[aa].d[1], x20\n"     // Merge A[A].lower and upper
+                "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v8.4s, #0\n"
+                "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+                "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v10.4s, #0\n"
+                "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and upper
+                "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+                "movi v12.4s, #0\n"
+                "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+                "movi v14.4s, #0\n"
+                "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v18.4s, #0\n"
+                "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+                "movi v20.4s, #0\n"
+                "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+                "movi v22.4s, #0\n"
+                "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi v24.4s, #0\n"
+                "add %x[a_ptr], %x[a_ptr], #0x10\n"
+                "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v26.4s, #0\n"
+                "add %x[b_ptr], %x[b_ptr], #0x18\n"
+                "movi v27.4s, #0\n"
+                "movi v28.4s, #0\n"
+
+                "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+                "1:\n" // Main loop
+                // First unroll
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+                "ins %[b2].d[1], x20\n"            // Merge B[2].lower and .upper
+                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                // Second unroll
+                "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and .upper
+                "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+                "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "add %x[a_ptr], %x[a_ptr], #0x20\n"
+                "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "subs %x[k], %x[k], #0x1\n"
+                "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+                "ins %[aa].d[1], x20\n"            // Merge A[A].lower and .upper
+                "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "add %x[b_ptr], %x[b_ptr], #0x30\n"
+                "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "bne 1b\n"
+
+                "2:\n" // Even tail
+                "cbnz %x[odd_k], 3f\n"
+
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "add %[a_ptr], %[a_ptr], #0x10\n"
+                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "add %[b_ptr], %[b_ptr], #0x18\n"
+                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+                "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+                "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "b 4f\n" // Complete write out
+
+                "3:\n" // Odd tail
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+
+                "4:\n" // End of function
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "str q27, [%x[c_ptr], #0x140]\n"
+                "str q12, [%x[c_ptr], #0x150]\n"
+                "str q20, [%x[c_ptr], #0x160]\n"
+                "str q28, [%x[c_ptr], #0x170]\n"
+                "add %x[c_ptr], %x[c_ptr], #0x180\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+                [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+                : [odd_k] "r"(odd_k)
+                : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
new file mode 100644
index 0000000..08f90e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class gemm_s8_12x8
+{
+public:
+    typedef int8_t  operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 8;
+    static const int  A_block      = 4;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 12;
+    static const int  B_block      = 4;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 4;
+
+    kern_type kernel = a64_gemm_s8_12x8;
+
+    gemm_s8_12x8(const CPUInfo *ci)
+    {
+        if(ci->get_cpu_model() == CPUModel::A55r1)
+        {
+            kernel = a64_gemm_s8_12x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
new file mode 100644
index 0000000..ef2f291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+// Cortex-A55r1 tuned variant of the 12x8 signed 8-bit GEMM kernel: multiplies
+// interleaved int8 panels (Apanel x Bpanel) with SDOT and writes the int32
+// result tiles to Cpanel.  Unlike the generic variant, 128-bit operands are
+// loaded as a 64-bit ldr plus a ldr/ins pair for the high half — presumably
+// to suit the A55's load pipeline (TODO confirm against tuning notes).
+void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+    const int8_t *a_ptr = Apanel;
+    int32_t      *c_ptr = Cpanel;
+
+    // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+    const int W = K / 4;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk    = (W & 1);
+    const int k_iters = ((W + 1) / 2) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            int k = k_iters;
+
+            // Pin operand variables to specific vector registers so the asm
+            // below can use them alongside the hard-coded accumulators v8-v31.
+            register int32x4_t a0 asm("v0");
+            register int32x4_t a1 asm("v1");
+            register int32x4_t b0 asm("v2");
+            register int32x4_t b1 asm("v3");
+            register int32x4_t b2 asm("v4");
+            register int32x4_t a0a asm("v5");
+            register int32x4_t a1a asm("v6");
+
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_SDOT
+#else
+                ".arch armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi   v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi   v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi   v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi   v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v18.4s, #0x0\n"
+                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v20.4s, #0x0\n"
+                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v22.4s, #0x0\n"
+                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v24.4s, #0x0\n"
+                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v26.4s, #0x0\n"
+                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v28.4s, #0x0\n"
+                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v30.4s, #0x0\n"
+                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "sdot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                "sdot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Unroll 1: second half of the double-pumped iteration, using the a0a/a1a operands.
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "sdot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "ins    %[a0].d[1], x20\n"
+                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "ins    %[a1].d[1], x20\n"
+                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "sdot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "b.ne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a.
+                "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "sdot   v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "cbnz   %w[oddk], 2f\n"
+
+                // Even K continuation: finish this iteration, then run a full final pass on a0a/a1a.
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "sdot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "sdot   v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "sdot   v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "sdot   v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "sdot   v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "sdot   v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "sdot   v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "sdot   v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "sdot   v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "sdot   v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "sdot   v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "sdot   v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot   v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "sdot   v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "sdot   v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "sdot   v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "sdot   v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "sdot   v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "sdot   v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot   v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot   v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "b      3f\n"
+
+                // Odd K continuation: only this single block pair remains; finish it while prefetching C.
+                "2:\n"
+                "sdot   v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "sdot   v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "sdot   v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "sdot   v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "sdot   v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "sdot   v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "sdot   v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot   v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "sdot   v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot   v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "sdot   v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot   v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "sdot   v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot   v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "sdot   v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot   v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "sdot   v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "sdot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]") "sdot   v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "sdot   v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Common tail: write all 24 accumulators (q8-q31) out to the C panel.
+                "3:\n"
+                "str    q8,   [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,   [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem sdot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000..c76f99d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define an assembler macro that hand-encodes the SDOT instruction (opcode base 0x4f80e000), for toolchains without dot-product support
+#define _DECLARE_SDOT                                                                                  \
+    ".altmacro\n"                                                                                      \
+    ".macro sdot opd:req, opn:req, opm:req\n"                                                          \
+    "local vd, vn, vm, h, l\n"                                                                         \
+    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"                                                               \
+    ".set vd,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"                                                              \
+    ".set vn,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".irp idx,0,1,2,3\n"                                                                               \
+    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"                                                      \
+    ".set vm,\\reg\n"                                                                                  \
+    ".set h,\\idx / 2\n"                                                                               \
+    ".set l,\\idx %% 2\n"                                                                              \
+    ".endif\n"                                                                                         \
+    ".endr\n"                                                                                          \
+    ".endr\n"                                                                                          \
+    ".ifndef vd\n"                                                                                     \
+    ".error \"Bad operand \\opd\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vn\n"                                                                                     \
+    ".error \"Bad operand \\opn\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vm\n"                                                                                     \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef h\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef l\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".int     0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"                      \
+    ".endm\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
new file mode 100644
index 0000000..258ef5e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const int8_t *a_ptr = Apanel;
+    int32_t      *c_ptr = Cpanel;
+    // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+    const int W = K / 4;
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk         = (W & 1);
+    const int init_value_k = ((W + 1) / 2) - 1;
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr  = Bpanel;
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr                = a_ptr0;
+            int                k = init_value_k;
+            register int32x4_t a0 asm("v0");
+            register int32x4_t a1 asm("v1");
+            register int32x4_t b0 asm("v2");
+            register int32x4_t b1 asm("v3");
+            register int32x4_t b2 asm("v4");
+            register int32x4_t a0a asm("v5");
+            register int32x4_t a1a asm("v6");
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_SDOT
+#else
+                ".arch  armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "sdot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "sdot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    %q[a0], [%[a_ptr], #64]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #112]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "bne    1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+
+                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem sdot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
new file mode 100644
index 0000000..2ec28f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Forward declaration of the kernel entry point
+// (implementation lives in a64_gemm_s8_4x4/generic.cpp).
+void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+#include "arm_gemm.hpp"
+
+// 4x4 int8 GEMM "strategy" class.
+//
+// Like the other strategy headers in this directory (e.g. gemm_u16_12x8),
+// this class only describes the kernel family: the operand/result types,
+// the interleaved data layout the kernel expects, and its blocking
+// parameters.  It carries no state of its own beyond the kernel pointer.
+class gemm_s8_4x4
+{
+public:
+    // Element type of the A/B input panels and of the output accumulators.
+    typedef int8_t  operand_type;
+    typedef int32_t result_type;
+
+    // Kernel signature: (A panel, B panel, C output, ablocks, bblocks, K).
+    typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    // A is taken 4 rows at a time, blocked 16-deep in K, not transposed.
+    static const int  A_interleave = 4;
+    static const int  A_block      = 16;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    // B uses the same 4x16 blocking but is pre-transposed.
+    static const int  B_interleave = 4;
+    static const int  B_block      = 16;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    // Each kernel invocation produces 4x4 int32 output tiles and consumes
+    // K in multiples of 16 int8 values.
+    static const int out_width  = 4;
+    static const int out_height = 4;
+    static const int k_unroll   = 16;
+
+    // Only one (generic) implementation exists for this strategy, so the
+    // kernel pointer is fixed regardless of CPU.
+    kern_type kernel = a64_gemm_s8_4x4;
+
+    // CPUInfo is accepted for interface uniformity with other strategies
+    // but is unused here: there is no CPU-specific variant to select.
+    gemm_s8_4x4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
new file mode 100644
index 0000000..243b94e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+// Generic AArch64 int8 GEMM kernel (no dot-product instructions) producing
+// 4x4 tiles of int32 results via smull/smlal2/sadalp widening arithmetic.
+//
+// Apanel/Bpanel: pre-interleaved int8 input panels (layout described by the
+//   gemm_s8_4x4 strategy class: interleave 4, block 16, B transposed).
+// Cpanel: output; one 4x4 int32 tile (64 bytes) is written per inner
+//   iteration, tiles stored sequentially.
+// ablocks/bblocks: number of row/column blocks to process.
+// K: depth of the multiply; assumed already padded to a multiple of 16
+//   by the interleaving transforms -- TODO confirm against the transforms.
+void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const int8_t *a_ptr = Apanel;
+    int32_t      *c_ptr = Cpanel;
+
+    // The asm consumes K in 16-deep blocks; convert K to a block count.
+    K /= 16;
+    // An odd number of blocks is handled by the dedicated tail at "2:".
+    int oddk = (K & 1);
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        // Rewind A for every column block; B restarts from the panel base.
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+
+            // Main loop handles two 16-deep blocks per iteration; the final
+            // block pair (even K) or block (odd K) is peeled off into the
+            // detached tails below, hence the rounding here.
+            int k = ((K + 1) / 2) - 1;
+
+            // Pin the B operands to named registers so the asm can refer to
+            // them symbolically; v0-v3 carry A values, v12-v15 hold the
+            // 16-bit partial products and v16-v31 the int32 accumulators.
+            register int8x16_t b0 asm("v4");
+            register int8x16_t b1 asm("v5");
+            register int8x16_t b2 asm("v6");
+            register int8x16_t b3 asm("v7");
+            register int8x16_t b0a asm("v8");
+            register int8x16_t b1a asm("v9");
+            register int8x16_t b2a asm("v10");
+            register int8x16_t b3a asm("v11");
+
+            __asm __volatile(
+                // Zero the sixteen accumulators while preloading the first
+                // A (q0-q3) and B (b0-b3) blocks and priming the prefetcher.
+                "movi    v16.4s, #0x0\n"
+                "ldr    q0, [%[a_ptr]]\n"
+                "movi    v17.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v18.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v19.4s, #0x0\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "movi    v20.4s, #0x0\n"
+                "ldr    %q[b3], [%[b_ptr], #48]\n"
+                "movi    v21.4s, #0x0\n"
+                "ldr    q1, [%[a_ptr], #16]\n"
+                "movi    v22.4s, #0x0\n"
+                "ldr    q2, [%[a_ptr], #32]\n"
+                "movi    v23.4s, #0x0\n"
+                "ldr    q3, [%[a_ptr], #48]\n"
+                "movi    v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi    v30.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]") "movi    v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+                // Loop structure optimized for A57 (after r0).
+
+                // Unavoidably, the multiply will "dribble" if
+                // dual issued with an add.
+
+                // Minimize the effect of this by making sure
+                // there are 2 adds to run under the dribbled
+                // multiply.
+
+                // Pipeline in blocks of 8 multiplies - combine
+                // this iteration's multiplies with adds from
+                // the previous iteration.
+
+                // So the first block doesn't have any adds to
+                // do - but because all the adds are at the
+                // start of the block it's only the first couple
+                // of multiplies that need to be pulled out.
+
+                // Start of unroll 0 (first iteration)
+                "smull    v12.8h, v0.8b, %[b0].8b\n"
+                "smull    v13.8h, v0.8b, %[b1].8b\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Unroll 0 continuation (branch target)
+                "1:\n"
+                "smull    v14.8h, v0.8b, %[b2].8b\n"
+                "subs    %w[k], %w[k], #1\n"
+                "smull    v15.8h, v0.8b, %[b3].8b\n"
+                "ldr    %q[b0a], [%[b_ptr], #64]\n"
+                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
+                "ldr    %q[b1a], [%[b_ptr], #80]\n"
+                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+                "ldr     q0, [%[a_ptr], #64]\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2].8b\n"
+                "ldr    %q[b2a], [%[b_ptr], #96]\n"
+                "smull    v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
+                "ldr    %q[b3a], [%[b_ptr], #112]\n"
+                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
+                "add    %[b_ptr], %[b_ptr], #128\n"
+                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+                "ldr     q1, [%[a_ptr], #80]\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "smull    v14.8h, v2.8b, %[b2].8b\n"
+                "smull    v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0].16b\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2].16b\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+                "ldr     q2, [%[a_ptr], #96]\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "smull    v14.8h, v3.8b, %[b2].8b\n"
+                "smull    v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
+                "ldr     %q[b0], [%[b_ptr], #0]\n"
+                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+                "ldr     q3, [%[a_ptr], #112]\n"
+
+                // Unroll 1
+                "sadalp    v28.4s, v12.8h\n"
+                "smull    v12.8h, v0.8b, %[b0a].8b\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "smull    v13.8h, v0.8b, %[b1a].8b\n"
+                "sadalp    v31.4s, v15.8h\n"
+                "smull    v14.8h, v0.8b, %[b2a].8b\n"
+                "smull    v15.8h, v0.8b, %[b3a].8b\n"
+                "ldr     %q[b1], [%[b_ptr], #16]\n"
+                "smlal2    v12.8h, v0.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1a].16b\n"
+                "ldr     %q[b2], [%[b_ptr], #32]\n"
+                "smlal2    v14.8h, v0.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3a].16b\n"
+                "ldr     q0, [%[a_ptr], #128]\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0a].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1a].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "smull    v14.8h, v1.8b, %[b2a].8b\n"
+                "smull    v15.8h, v1.8b, %[b3a].8b\n"
+                "ldr     %q[b3], [%[b_ptr], #48]\n"
+                "smlal2    v12.8h, v1.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v1.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v1.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3a].16b\n"
+                "ldr     q1, [%[a_ptr], #16]\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0a].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1a].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "smull    v14.8h, v2.8b, %[b2a].8b\n"
+                "smull    v15.8h, v2.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0a].16b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "smlal2    v13.8h, v2.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2a].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "smlal2    v15.8h, v2.16b, %[b3a].16b\n"
+                "ldr     q2, [%[a_ptr], #32]\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0a].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1a].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "smull    v14.8h, v3.8b, %[b2a].8b\n"
+                "smull    v15.8h, v3.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v3.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v3.16b, %[b3a].16b\n"
+                "ldr     q3, [%[a_ptr], #48]\n"
+
+                // Start of unroll 0 for next iteration.
+                "sadalp    v28.4s, v12.8h\n"
+                "smull    v12.8h, v0.8b, %[b0].8b\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "smull    v13.8h, v0.8b, %[b1].8b\n"
+                "sadalp    v31.4s, v15.8h\n"
+                "bne    1b\n"
+
+                // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "smull    v14.8h, v0.8b, %[b2].8b\n"
+                "smull    v15.8h, v0.8b, %[b3].8b\n"
+                "ldr    %q[b0a], [%[b_ptr], #64]\n"
+                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
+                "ldr    %q[b1a], [%[b_ptr], #80]\n"
+                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+                "ldr     q0, [%[a_ptr], #64]\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2].8b\n"
+                "ldr    %q[b2a], [%[b_ptr], #96]\n"
+                "smull    v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
+                "ldr    %q[b3a], [%[b_ptr], #112]\n"
+                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
+                "add    %[b_ptr], %[b_ptr], #128\n"
+                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+                "ldr     q1, [%[a_ptr], #80]\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "smull    v14.8h, v2.8b, %[b2].8b\n"
+                "smull    v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+                "ldr     q2, [%[a_ptr], #96]\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "smull    v14.8h, v3.8b, %[b2].8b\n"
+                "smull    v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+                "ldr     q3, [%[a_ptr], #112]\n"
+
+                // Unroll 1
+                "sadalp    v28.4s, v12.8h\n"
+                "smull    v12.8h, v0.8b, %[b0a].8b\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "smull    v13.8h, v0.8b, %[b1a].8b\n"
+                "sadalp    v31.4s, v15.8h\n"
+                "smull    v14.8h, v0.8b, %[b2a].8b\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "smull    v15.8h, v0.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v0.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v0.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3a].16b\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0a].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1a].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2a].8b\n"
+                "smull    v15.8h, v1.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0a].16b\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smlal2    v13.8h, v1.16b, %[b1a].16b\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smlal2    v14.8h, v1.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3a].16b\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0a].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1a].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smull    v14.8h, v2.8b, %[b2a].8b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "smull    v15.8h, v2.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0a].16b\n"
+                "str    q16, [%[c_ptr]]\n"
+                "smlal2    v13.8h, v2.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v2.16b, %[b3a].16b\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0a].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1a].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smull    v14.8h, v3.8b, %[b2a].8b\n"
+                "addp    v20.4s, v24.4s, v25.4s\n"
+                "addp    v21.4s, v26.4s, v27.4s\n"
+                "smull    v15.8h, v3.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0a].16b\n"
+                "str    q17, [%[c_ptr], #16]\n"
+                "smlal2    v13.8h, v3.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2a].16b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "smlal2    v15.8h, v3.16b, %[b3a].16b\n"
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "smull    v14.8h, v0.8b, %[b2].8b\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "smull    v15.8h, v0.8b, %[b3].8b\n"
+                "add    %[b_ptr], %[b_ptr], #64\n"
+                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2].8b\n"
+                "smull    v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smull    v14.8h, v2.8b, %[b2].8b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "smull    v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0].16b\n"
+                "str    q16, [%[c_ptr]]\n"
+                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smull    v14.8h, v3.8b, %[b2].8b\n"
+                "addp    v20.4s, v24.4s, v25.4s\n"
+                "addp    v21.4s, v26.4s, v27.4s\n"
+                "smull    v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
+                "str    q17, [%[c_ptr], #16]\n"
+                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+
+                "3:\n"
+
+                // Final additions
+                "sadalp    v28.4s, v12.8h\n"
+                "str    q18, [%[c_ptr], #32]\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "sadalp    v31.4s, v15.8h\n"
+
+                // Horizontal reduction, phase 1
+                "addp    v22.4s, v28.4s, v29.4s\n"
+                "addp    v23.4s, v30.4s, v31.4s\n"
+
+                // Horizontal reduction, phase 2
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "str    q19, [%[c_ptr], #48]\n"
+                "add    %[c_ptr], %[c_ptr], #64\n"
+
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+                [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a), [b3a] "+w"(b3a),
+                [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+                "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
new file mode 100644
index 0000000..3975732
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementation (defined in a64_gemm_u16_12x8/generic.cpp)
+void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+// 12x8 uint16 GEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPU_type
+// structure.
+class gemm_u16_12x8
+{
+public:
+    typedef uint16_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 8;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 12;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a64_gemm_u16_asimd_12x8;
+
+    gemm_u16_12x8(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
new file mode 100644
index 0000000..7903878
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const uint16_t *a_ptr = Apanel;
+    uint32_t       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint16_t *a_ptr0 = a_ptr;
+        const uint16_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr            = a_ptr0;
+            const bool odd_k = K & 0x1;
+            int        k     = (K + 1) / 2 - 1;
+
+            register uint16x8_t aa asm("v0");
+            register uint16x8_t ab asm("v1");
+            register uint16x8_t b0 asm("v2");
+            register uint16x8_t b1 asm("v3");
+            register uint16x8_t b2 asm("v4");
+
+            __asm __volatile(
+                "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+                "movi v5.4s, #0\n"
+                "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+                "movi v6.4s, #0\n"
+                "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+                "ins %[aa].d[1], x20\n"     // Merge A[A].lower and upper
+                "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v8.4s, #0\n"
+                "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+                "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v10.4s, #0\n"
+                "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and upper
+                "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+                "movi v12.4s, #0\n"
+                "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+                "movi v14.4s, #0\n"
+                "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v18.4s, #0\n"
+                "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+                "movi v20.4s, #0\n"
+                "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+                "movi v22.4s, #0\n"
+                "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi v24.4s, #0\n"
+                "add %x[a_ptr], %x[a_ptr], #0x10\n"
+                "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v26.4s, #0\n"
+                "add %x[b_ptr], %x[b_ptr], #0x18\n"
+                "movi v27.4s, #0\n"
+                "movi v28.4s, #0\n"
+
+                "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+                "1:\n" // Main loop
+                // First unroll: all multiply-accumulates must be unsigned (umlal/umlal2) to match the uint16 operands
+                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+                "ins %[b2].d[1], x20\n"            // Merge B[2].lower and .upper
+                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                // Second unroll
+                "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and .upper
+                "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+                "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "add %x[a_ptr], %x[a_ptr], #0x20\n"
+                "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "subs %x[k], %x[k], #0x1\n"
+                "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+                "ins %[aa].d[1], x20\n"            // Merge A[A].lower and .upper
+                "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "add %x[b_ptr], %x[b_ptr], #0x30\n"
+                "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "bne 1b\n"
+
+                "2:\n" // Even tail
+                "cbnz %x[odd_k], 3f\n"
+
+                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "add %[a_ptr], %[a_ptr], #0x10\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "add %[b_ptr], %[b_ptr], #0x18\n"
+                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+                "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+                "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "b 4f\n" // Complete write out
+
+                "3:\n" // Odd tail
+                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+
+                "4:\n" // End of function
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "str q27, [%x[c_ptr], #0x140]\n"
+                "str q12, [%x[c_ptr], #0x150]\n"
+                "str q20, [%x[c_ptr], #0x160]\n"
+                "str q28, [%x[c_ptr], #0x170]\n"
+                "add %x[c_ptr], %x[c_ptr], #0x180\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+                [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+                : [odd_k] "r"(odd_k)
+                : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
new file mode 100644
index 0000000..26255b1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Kernel implementations: generic, plus a Cortex-A55r1 tuned variant selected at runtime
+void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+class gemm_u8_12x8
+{
+public:
+    typedef uint8_t  operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 8;
+    static const int  A_block      = 4;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 12;
+    static const int  B_block      = 4;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 4;
+
+    kern_type kernel = a64_gemm_u8_12x8;
+
+    gemm_u8_12x8(const CPUInfo *ci)
+    {
+        if(ci->get_cpu_model() == CPUModel::A55r1)
+        {
+            kernel = a64_gemm_u8_12x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
new file mode 100644
index 0000000..f8fafbd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+    const uint8_t *a_ptr = Apanel;
+    uint32_t      *c_ptr = Cpanel;
+
+    // We divide K by 4 because the udot instruction processes 4 elements at a time.
+    const int W = K / 4;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk    = (W & 1);
+    const int k_iters = ((W + 1) / 2) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            int k = k_iters;
+
+            register int32x4_t a0 asm("v0");
+            register int32x4_t a1 asm("v1");
+            register int32x4_t b0 asm("v2");
+            register int32x4_t b1 asm("v3");
+            register int32x4_t b2 asm("v4");
+            register int32x4_t a0a asm("v5");
+            register int32x4_t a1a asm("v6");
+
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_UDOT
+#else
+                ".arch armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi   v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi   v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi   v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi   v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v18.4s, #0x0\n"
+                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v20.4s, #0x0\n"
+                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v22.4s, #0x0\n"
+                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v24.4s, #0x0\n"
+                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v26.4s, #0x0\n"
+                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v28.4s, #0x0\n"
+                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v30.4s, #0x0\n"
+                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "udot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                "udot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "udot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "ins    %[a0].d[1], x20\n"
+                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "ins    %[a1].d[1], x20\n"
+                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "udot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "b.ne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a.
+                "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "udot   v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "cbnz   %w[oddk], 2f\n"
+
+                // Even K continuation
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "udot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "udot   v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "udot   v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "udot   v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "udot   v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "udot   v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "udot   v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "udot   v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "udot   v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "udot   v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "udot   v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "udot   v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot   v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "udot   v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "udot   v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "udot   v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "udot   v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "udot   v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "udot   v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "udot   v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot   v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "b      3f\n"
+
+                // Odd K continuation
+                "2:\n"
+                "udot   v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "udot   v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "udot   v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "udot   v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "udot   v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "udot   v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "udot   v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot   v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "udot   v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot   v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "udot   v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot   v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "udot   v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot   v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "udot   v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot   v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "udot   v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "udot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]") "udot   v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "udot   v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,   [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,   [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem udot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000..5ee273b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
+#define _DECLARE_UDOT                                                                                  \
+    ".altmacro\n"                                                                                      \
+    ".macro udot opd:req, opn:req, opm:req\n" /* operands spelled as "vD.4s", "vN.16b", "vM.4b[i]" */  \
+    "local vd, vn, vm, h, l\n"                                                                         \
+    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" /* string-match against each register to recover its index */ \
+    ".set vd,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"                                                              \
+    ".set vn,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".irp idx,0,1,2,3\n"                                                                               \
+    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"                                                      \
+    ".set vm,\\reg\n"                                                                                  \
+    ".set h,\\idx / 2\n" /* lane index is split into H (bit 11) and L (bit 21) encoding fields */      \
+    ".set l,\\idx %% 2\n"                                                                              \
+    ".endif\n"                                                                                         \
+    ".endr\n"                                                                                          \
+    ".endr\n"                                                                                          \
+    ".ifndef vd\n" /* each operand must have matched above, otherwise fail the assembly */             \
+    ".error \"Bad operand \\opd\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vn\n"                                                                                     \
+    ".error \"Bad operand \\opn\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vm\n"                                                                                     \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef h\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef l\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".endif\n" /* NOTE(review): earlier error branches use .exitm; this one falls through — confirm intended */ \
+    ".int     0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" /* emit raw UDOT (by element) encoding */ \
+    ".endm\n" // end of emulated udot; removed again with ".purgem udot" after use
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
new file mode 100644
index 0000000..d026dc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) // u8 GEMM inner kernel: accumulates one 384-byte u32 tile (v8..v31) per A/B block pair using UDOT
+{
+    const uint8_t *a_ptr = Apanel; // read cursor over the interleaved A panel
+    uint32_t      *c_ptr = Cpanel; // write cursor over the output C panel
+    // We divide K by 4 because the udot instruction processes 4 elements at a time.
+    const int W = K / 4;
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk         = (W & 1);           // non-zero selects the "odd K" tail in the asm (label 2)
+    const int init_value_k = ((W + 1) / 2) - 1; // main-loop count: each iteration consumes two 4-element quads
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint8_t *a_ptr0 = a_ptr; // start of this A block: re-walked once per B block
+        const uint8_t *b_ptr  = Bpanel;
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr                 = a_ptr0;
+            int                 k = init_value_k; // loop counter, decremented inside the asm ("subs %w[k]")
+            register uint8x16_t a0 asm("v0"); // pin operands to fixed NEON registers so the asm can name them
+            register uint8x16_t a1 asm("v1");
+            register uint8x16_t b0 asm("v2");
+            register uint8x16_t b1 asm("v3");
+            register uint8x16_t b2 asm("v4");
+            register uint8x16_t a0a asm("v5"); // second A quad, double-buffered against a0/a1
+            register uint8x16_t a1a asm("v6");
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_UDOT
+#else
+                ".arch  armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n" // k==0 when W is 1 or 2: go straight to the tail
+
+                // Loop proper
+                "1:\n" // main loop: two K-quads (64 bytes of A, 96 bytes of B) per iteration
+                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "udot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n" // load the second A quad while the first is in flight
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n" // second half of the iteration, on a0a/a1a
+                "udot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    %q[a0], [%[a_ptr], #64]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #112]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n" // advance A by two quads (2 x 32 bytes)
+                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n" // advance B by two 48-byte rows
+                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "bne    1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n" // last quad: interleave stores with the remaining dot products
+
+                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+
+                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n" // single quad consumed: half the even-K advance
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n" // 24 q-registers x 16 bytes written per tile
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem udot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"); // NOTE(review): no "memory" clobber despite the str instructions, unlike the sibling kernel above — confirm this is safe
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
new file mode 100644
index 0000000..5aa5291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Kernel definition
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
+
+class gemm_u8_4x4 // traits/strategy class describing the 4x4 u8 kernel (layout constants + entry point)
+{
+public:
+    typedef uint8_t  operand_type; // element type of the A/B panels
+    typedef uint32_t result_type;  // element type of the C panel
+
+    typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); // (Apanel, Bpanel, Cpanel, ablocks, bblocks, K)
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 4;
+    static const int  A_block      = 16;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 4;
+    static const int  B_block      = 16;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 4;  // output tile is 4x4
+    static const int out_height = 4;
+    static const int k_unroll   = 16; // K is consumed 16 elements at a time (matches A_block/B_block)
+
+    kern_type kernel = nullptr; // selected kernel; set in the constructor
+
+    gemm_u8_4x4(const CPUInfo *ci) // NOTE(review): ci is unused here — only one implementation exists for this shape
+    {
+        kernel = a64_gemm_u8_4x4;
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
new file mode 100644
index 0000000..0a881ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+// u8->u32 GEMM inner kernel.
+//
+// Apanel/Bpanel point to interleaved panels of 8-bit operands (4 rows/cols
+// interleaved, 16 bytes of K per block, as described by the gemm_u8_4x4
+// strategy).  For each (yb, xb) block pair, 8-bit products are widened with
+// umull/umull2 and accumulated pairwise into 32-bit lanes with uadalp; the
+// final addp reductions collapse them into a 4x4 block of uint32 results
+// written contiguously through c_ptr.
+//
+// K is given in pre-unroll units and must be a multiple of 16.
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const uint8_t *a_ptr = Apanel;
+    uint32_t      *c_ptr = Cpanel;
+    // Convert K to the number of 16-byte unrolled steps (k_unroll == 16).
+    K /= 16;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+
+            // One iteration is peeled into the prologue/epilogue below, so
+            // the loop body runs (K - 1) times.
+            int k = K - 1;
+
+            register uint8x16_t b0 asm("v4");
+            register uint8x16_t b1 asm("v5");
+            register uint8x16_t b2 asm("v6");
+            register uint8x16_t b3 asm("v7");
+
+            __asm __volatile(
+                "movi    v16.4s, #0x0\n"
+                "ldr    q0, [%[a_ptr]]\n"
+                "movi    v17.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v18.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v19.4s, #0x0\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "movi    v20.4s, #0x0\n"
+                "ldr    %q[b3], [%[b_ptr], #48]\n"
+                "movi    v21.4s, #0x0\n"
+                "ldr    q1, [%[a_ptr], #16]\n"
+                "movi    v22.4s, #0x0\n"
+                "ldr    q2, [%[a_ptr], #32]\n"
+                "movi    v23.4s, #0x0\n"
+                "ldr    q3, [%[a_ptr], #48]\n"
+                "movi    v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi    v30.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]") "movi    v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+                "umull    v12.8h, v0.8b, %[b0].8b\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "umull    v13.8h, v0.8b, %[b1].8b\n"
+                "umull    v14.8h, v0.8b, %[b2].8b\n"
+                "add    %[b_ptr], %[b_ptr], #64\n"
+                "umull    v15.8h, v0.8b, %[b3].8b\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 2f\n"
+
+                "1:\n"
+                "uadalp    v16.4s, v12.8h\n"
+                "umull2    v12.8h, v0.16b, %[b0].16b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull2    v13.8h, v0.16b, %[b1].16b\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull2    v14.8h, v0.16b, %[b2].16b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull2    v15.8h, v0.16b, %[b3].16b\n"
+                "ldr     q0, [%[a_ptr]]\n"
+
+                "uadalp    v16.4s, v12.8h\n"
+                "umull    v12.8h, v1.8b, %[b0].8b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull    v13.8h, v1.8b, %[b1].8b\n"
+                "subs    %w[k], %w[k], #1\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull    v14.8h, v1.8b, %[b2].8b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull    v15.8h, v1.8b, %[b3].8b\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull2    v12.8h, v1.16b, %[b0].16b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull2    v13.8h, v1.16b, %[b1].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "uadalp    v22.4s, v14.8h\n"
+                "umull2    v14.8h, v1.16b, %[b2].16b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull2    v15.8h, v1.16b, %[b3].16b\n"
+                "ldr     q1, [%[a_ptr], #16]\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull    v12.8h, v2.8b, %[b0].8b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull    v13.8h, v2.8b, %[b1].8b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "uadalp    v22.4s, v14.8h\n"
+                "umull    v14.8h, v2.8b, %[b2].8b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull    v15.8h, v2.8b, %[b3].8b\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull2    v12.8h, v2.16b, %[b0].16b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull2    v13.8h, v2.16b, %[b1].16b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull2    v14.8h, v2.16b, %[b2].16b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull2    v15.8h, v2.16b, %[b3].16b\n"
+                "ldr    q2, [%[a_ptr], #32]\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull    v12.8h, v3.8b, %[b0].8b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull    v13.8h, v3.8b, %[b1].8b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull    v14.8h, v3.8b, %[b2].8b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull    v15.8h, v3.8b, %[b3].8b\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "umull2    v12.8h, v3.16b, %[b0].16b\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "umull2    v13.8h, v3.16b, %[b1].16b\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "umull2    v14.8h, v3.16b, %[b2].16b\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "uadalp    v31.4s, v15.8h\n"
+                "umull2    v15.8h, v3.16b, %[b3].16b\n"
+                "ldr    %q[b3], [%[b_ptr], #48]\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "umull    v12.8h, v0.8b, %[b0].8b\n"
+                "add    %[b_ptr], %[b_ptr], #64\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "umull    v13.8h, v0.8b, %[b1].8b\n"
+                "ldr    q3, [%[a_ptr], #48]\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "umull    v14.8h, v0.8b, %[b2].8b\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "uadalp    v31.4s, v15.8h\n"
+                "umull    v15.8h, v0.8b, %[b3].8b\n"
+                "bne    1b\n"
+
+                // Branch target
+                "2:\n"
+                "uadalp    v16.4s, v12.8h\n"
+                "umull2    v12.8h, v0.16b, %[b0].16b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull2    v13.8h, v0.16b, %[b1].16b\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull2    v14.8h, v0.16b, %[b2].16b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull2    v15.8h, v0.16b, %[b3].16b\n"
+
+                "uadalp    v16.4s, v12.8h\n"
+                "umull    v12.8h, v1.8b, %[b0].8b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull    v13.8h, v1.8b, %[b1].8b\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull    v14.8h, v1.8b, %[b2].8b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull    v15.8h, v1.8b, %[b3].8b\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull2    v12.8h, v1.16b, %[b0].16b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull2    v13.8h, v1.16b, %[b1].16b\n"
+                "uadalp    v22.4s, v14.8h\n"
+                "umull2    v14.8h, v1.16b, %[b2].16b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull2    v15.8h, v1.16b, %[b3].16b\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull    v12.8h, v2.8b, %[b0].8b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull    v13.8h, v2.8b, %[b1].8b\n"
+                "uadalp    v22.4s, v14.8h\n"
+                "umull    v14.8h, v2.8b, %[b2].8b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull    v15.8h, v2.8b, %[b3].8b\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull2    v12.8h, v2.16b, %[b0].16b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull2    v13.8h, v2.16b, %[b1].16b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull2    v14.8h, v2.16b, %[b2].16b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull2    v15.8h, v2.16b, %[b3].16b\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull    v12.8h, v3.8b, %[b0].8b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull    v13.8h, v3.8b, %[b1].8b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull    v14.8h, v3.8b, %[b2].8b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull    v15.8h, v3.8b, %[b3].8b\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "umull2    v12.8h, v3.16b, %[b0].16b\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "umull2    v13.8h, v3.16b, %[b1].16b\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "umull2    v14.8h, v3.16b, %[b2].16b\n"
+                "uadalp    v31.4s, v15.8h\n"
+                "umull2    v15.8h, v3.16b, %[b3].16b\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "uadalp    v31.4s, v15.8h\n"
+
+                // Reduce the 16 accumulators down to a 4x4 block of results.
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "addp    v20.4s, v24.4s, v25.4s\n"
+                "addp    v21.4s, v26.4s, v27.4s\n"
+                "addp    v22.4s, v28.4s, v29.4s\n"
+                "addp    v23.4s, v30.4s, v31.4s\n"
+
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+
+                "str    q16, [%[c_ptr]]\n"
+                "str    q17, [%[c_ptr], #16]\n"
+                "str    q18, [%[c_ptr], #32]\n"
+                "str    q19, [%[c_ptr], #48]\n"
+                "add    %[c_ptr], %[c_ptr], #64\n"
+
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+                :
+                // "memory" is required: the asm reads the A/B panels and stores
+                // the result block through c_ptr, which the register constraints
+                // alone do not tell the compiler (the a55r1 HGEMM kernel in this
+                // patch already lists it).  NOTE(review): x20/x21 appear unused
+                // by this particular asm block - presumably kept for uniformity.
+                : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+                "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
new file mode 100644
index 0000000..5fc0a7b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+// 24x8 HGEMM "strategy" class.  Describes the kernel properties.
+//
+// The generic "gemm_opt" function will instantiate one of these (allowing
+// the constructor to pick a kernel implementation).
+class hgemm_24x8
+{
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+    /* A panel layout: 8 rows interleaved, no K-blocking, not transposed. */
+    static const int  A_block      = 1;
+    static const int  A_interleave = 8;
+    static const bool A_transpose  = false;
+
+    /* B panel layout: 24 columns interleaved, no K-blocking, transposed. */
+    static const int  B_block      = 1;
+    static const int  B_interleave = 24;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters: each call produces 24x8 blocks of C. */
+    static const int out_width  = 24;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    // Default to the generic kernel
+    kern_type kernel = a64_hgemm_asimd_24x8;
+
+    // Switch to the Cortex-A55r1 tuned variant when running on that core.
+    hgemm_24x8(const CPUInfo *ci)
+    {
+        if(ci->get_cpu_model() == CPUModel::A55r1)
+        {
+            kernel = a64_hgemm_asimd_24x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
new file mode 100644
index 0000000..2186117
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+// Cortex-A55r1 tuned 24x8 FP16 GEMM kernel.
+//
+// The instruction stream is hand-scheduled for the in-order A55: 128-bit
+// operand loads are split into a 64-bit "ldr %d" plus a GPR "ldr x20" /
+// "ins ...d[1]" pair (see the comment on the operand registers below), and
+// the exact instruction ordering is part of the optimization - do not
+// reorder when editing.
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+    const __fp16 *a_ptr = Apanel;
+    __fp16       *c_ptr = Cpanel;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    int oddk    = (K & 1);
+    int k_iters = ((K + 1) / 2) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const __fp16 *a_ptr0 = a_ptr;
+        const __fp16 *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            int k = k_iters;
+            a_ptr = a_ptr0;
+
+            // As A55 requires 64-bit loads anyway, just use 64 bits of the
+            // "A" operands to save on "ins" instructions.  Since A55 is
+            // in-order, two sets of "A" operands and one set of "B" is
+            // sufficient.
+            register float16x8_t a0 asm("v0");
+            register float16x8_t a1 asm("v1");
+            register float16x8_t a0a asm("v2");
+            register float16x8_t a1a asm("v3");
+            register float16x8_t b0 asm("v4");
+            register float16x8_t b1 asm("v5");
+            register float16x8_t b2 asm("v6");
+
+            __asm __volatile(
+                // Enable FP16 extensions
+                ".arch    armv8.2-a+fp16\n"
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.8h, #0x0\n"
+                "ldr    %d[a0], [%[a_ptr]]\n"
+                "movi    v9.8h, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.8h, #0x0\n"
+                "ldr    %d[a1], [%[a_ptr], #8]\n"
+                "movi    v11.8h, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.8h, #0x0\n"
+                "movi    v13.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v14.8h, #0x0\n"
+                "movi    v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v16.8h, #0x0\n"
+                "movi    v17.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v18.8h, #0x0\n"
+                "movi    v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v20.8h, #0x0\n"
+                "movi    v21.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v22.8h, #0x0\n"
+                "movi    v23.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v24.8h, #0x0\n"
+                "movi    v25.8h, #0x0\n"
+                "movi    v26.8h, #0x0\n"
+                "movi    v27.8h, #0x0\n"
+                "movi    v28.8h, #0x0\n"
+                "movi    v29.8h, #0x0\n"
+                "movi    v30.8h, #0x0\n"
+                "movi    v31.8h, #0x0\n"
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Main loop: two K steps per pass (even half on a0/a1, odd
+                // half on a0a/a1a), with B loads interleaved between fmlas.
+                "1:\n"
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #16]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n"
+                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #24]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n"
+                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
+                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                // Unroll 1
+                "fmla     v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0a].h[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #32]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a1a].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1a].h[1]\n"
+                "fmla    v14.8h, %[b0].8h, %[a1a].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1a].h[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #40]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v17.8h, %[b1].8h, %[a0a].h[1]\n"
+                "fmla    v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0a].h[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v20.8h, %[b1].8h, %[a1a].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v22.8h, %[b1].8h, %[a1a].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1a].h[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.8h, %[b2].8h, %[a0a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0a].h[3]\n"
+
+                "fmla    v28.8h, %[b2].8h, %[a1a].h[0]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla    v29.8h, %[b2].8h, %[a1a].h[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v30.8h, %[b2].8h, %[a1a].h[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v31.8h, %[b2].8h, %[a1a].h[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "bne    1b\n"
+
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "cbnz    %w[oddk], 2f\n"
+
+                // Even K continuation
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #16]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #24]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n"
+                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
+                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "fmla     v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0a].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+                "fmla     v12.8h, %[b0].8h, %[a1a].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1a].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v14.8h, %[b0].8h, %[a1a].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1a].h[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #40]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v17.8h, %[b1].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+                "fmla    v20.8h, %[b1].8h, %[a1a].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla    v22.8h, %[b1].8h, %[a1a].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+
+                "fmla    v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla    v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1a].h[0]\n"
+                "fmla    v29.8h, %[b2].8h, %[a1a].h[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v30.8h, %[b2].8h, %[a1a].h[2]\n"
+                "fmla    v31.8h, %[b2].8h, %[a1a].h[3]\n"
+                "b    3f\n"
+
+                "2:\n"
+
+                // Odd tail
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+
+                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "add    %[a_ptr], %[a_ptr], #16\n"
+                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                // NOTE(review): the "#384" L2 write-prefetch below duplicates the
+                // one two lines above - presumably a copy/paste slip for another
+                // offset; harmless (prefetch only), confirm against upstream.
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+                // Common tail
+                // A55 won't dual issue these stores with anything else, so
+                // simplest to do them all in this common code.
+                "3:\n"
+                "str    q8,  [%[c_ptr]]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "str    q9,  [%[c_ptr], #48]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                // NOTE(review): label 5 appears unreferenced within this asm
+                // block - presumably a leftover from a related kernel; harmless.
+                "5:\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "=w"(a0), [a0a] "=w"(a0a), [a1] "=w"(a1), [a1a] "=w"(a1a),
+                [b0] "=w"(b0), [b1] "=w"(b1), [b2] "=w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                // NOTE(review): x21 is clobbered but not used by this asm block;
+                // only x20 carries the split 64-bit B loads.
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
new file mode 100644
index 0000000..65a5d43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 24x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+    const __fp16 *a_ptr = Apanel;
+    __fp16       *c_ptr = Cpanel;
+
+    // Outer loop walks the A blocks, inner loop the B blocks; one of the two
+    // counts is expected to be 1 (see the note above the function).
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const __fp16 *a_ptr0 = a_ptr;
+        const __fp16 *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            // Operands are pinned to named NEON registers so the inline asm
+            // below can refer to them symbolically; v8-v31 hold the 24x8
+            // accumulator tile.
+            register float16x8_t a0 asm("v0");
+            register float16x8_t a0a asm("v1");
+            register float16x8_t b0 asm("v2");
+            register float16x8_t b1 asm("v3");
+            register float16x8_t b2 asm("v4");
+            register float16x8_t b0a asm("v5");
+            register float16x8_t b1a asm("v6");
+            register float16x8_t b2a asm("v7");
+
+            // NOTE(review): the clobber list at the end omits "memory" even
+            // though results are stored through c_ptr (the sibling kernel in
+            // the preceding file lists it) - presumably safe because c_ptr is
+            // a "+r" operand and the output buffer is not otherwise read, but
+            // verify against the other variants.
+            __asm __volatile(
+                ".arch    armv8.2-a+fp16\n"
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.8h, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.8h, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.8h, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v11.8h, #0x0\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "movi    v12.8h, #0x0\n"
+                "ldr    %q[b0a], [%[b_ptr], #48]\n"
+                "movi    v13.8h, #0x0\n"
+                "ldr    %q[b1a], [%[b_ptr], #64]\n"
+                "movi    v14.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v16.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v17.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]") "movi    v18.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v20.8h, #0x0\n"
+                "movi    v21.8h, #0x0\n"
+                "movi    v22.8h, #0x0\n"
+                "movi    v23.8h, #0x0\n"
+                "movi    v24.8h, #0x0\n"
+                "movi    v25.8h, #0x0\n"
+                "movi    v26.8h, #0x0\n"
+                "movi    v27.8h, #0x0\n"
+                "movi    v28.8h, #0x0\n"
+                "movi    v29.8h, #0x0\n"
+                "movi    v30.8h, #0x0\n"
+                "movi    v31.8h, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Main loop: each pass consumes two K steps (a0 then a0a),
+                // with B operands double-buffered in b0/b1/b2 and b0a/b1a/b2a.
+                "1:\n"
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    %q[a0a], [%[a_ptr], #16]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %q[b2a], [%[b_ptr], #80]\n"
+                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla    v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[b_ptr], #288]")
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "ldr    %q[a0], [%[a_ptr], #32]\n"
+
+                // Second (unrolled) K step of the pass, using the "a" copies.
+                "fmla     v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+                "fmla    v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+                "fmla    v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+                "fmla     v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+                "fmla    v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+                "fmla    v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+                "fmla    v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+                "ldr    %q[b0a], [%[b_ptr], #48]\n"
+
+                "fmla    v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+                "fmla    v17.8h, %[b1a].8h, %[a0a].h[1]\n" ASM_PREFETCH("[%[b_ptr], #352]")
+                "fmla    v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+                "fmla    v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+                "fmla    v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+                "fmla    v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+                "fmla    v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+                "fmla    v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+                "ldr    %q[b1a], [%[b_ptr], #64]\n"
+
+                "fmla    v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+                "fmla    v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+                "fmla    v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+                "fmla    v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+                "fmla    v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+                "fmla    v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+
+                "bne    1b\n"
+                // Loop done (or skipped): fall through to the tail handling.
+                "4:\n"
+
+                // Jump to odd tail if necessary.
+                "cbnz    %w[oddk], 2f\n"
+
+                // Even tail.
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla   v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    %q[a0a], [%[a_ptr], #16]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %q[b2a], [%[b_ptr], #80]\n"
+                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla   v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+                // Final K step of the even tail, interleaving the stores of
+                // the 24x8 result block with the last multiply-accumulates.
+                "fmla     v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+                "fmla    v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+                "str    q8, [%[c_ptr]]\n"
+                "fmla    v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+
+                "fmla      v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "fmla    v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "fmla    v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+
+                "fmla    v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+                "fmla    v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+
+                "fmla    v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+                "fmla    v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+
+                "fmla     v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+                "fmla    v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+
+                "fmla      v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+                "fmla    v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+
+                "fmla    v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+                "fmla    v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+
+                "fmla    v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+                "fmla    v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+                "b    3f\n"
+
+                // Odd tail
+                "2:\n"
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add    %[a_ptr], %[a_ptr], #16\n"
+                "str    q8, [%[c_ptr]]\n"
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+
+                "fmla      v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+
+                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+
+                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+                // Common exit: store the last two rows and bump c_ptr past
+                // the 384-byte (24x8 fp16) output block.
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a0a] "+w"(a0a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k),
+                [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
new file mode 100644
index 0000000..91a9e8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPU_type
+// structure.
+class sgemm_12x8
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+    /* Layout of the interleaved A input panel */
+    static const int A_interleave = 8;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Layout of the interleaved B input panel */
+    static const int B_interleave = 12;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Size of the output block produced by one kernel call */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    // Generic kernel, used unless a CPU-specific variant is selected below.
+    kern_type kernel = a64_sgemm_asimd_12x8;
+
+    // Swap in a tuned kernel when the detected CPU has one available.
+    sgemm_12x8(const CPUInfo *ci)
+    {
+        const CPUModel model = ci->get_cpu_model();
+
+        if(model == CPUModel::A53)
+        {
+            kernel = a64_sgemm_asimd_12x8_a53;
+        }
+        else if(model == CPUModel::A55r0)
+        {
+            kernel = a64_sgemm_asimd_12x8_a55;
+        }
+        else if(model == CPUModel::A55r1)
+        {
+            kernel = a64_sgemm_asimd_12x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
new file mode 100644
index 0000000..618ebc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "nop\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "nop\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "ins    %[a0].d[1], x20\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "ins    %[a1].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+                "nop\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+                "nop\n"
+                "nop\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "nop\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+
+                "nop\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+                "bne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration. (even K)
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+                "nop\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "nop\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b    3f\n"
+
+                // Detached final iteration. (odd K)
+                "2:\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,  [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,  [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
new file mode 100644
index 0000000..4ca25eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "subs    %w[k], %w[k], #1\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "ins    %[b0].d[1], x20\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ins    %[b1].d[1], x20\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "ins    %[a0].d[1], x20\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "ins    %[a1].d[1], x20\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "ins    %[b0].d[1], x20\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "ins    %[b1].d[1], x20\n"
+
+                "bne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+                "cbnz    %w[oddk], 2f\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+                // Detached final iteration. (even K)
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "ins    %[b0].d[1], x20\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ins    %[b1].d[1], x20\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b    3f\n"
+
+                // Detached final iteration. (odd K)
+                "2:\n"
+
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,  [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,  [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
new file mode 100644
index 0000000..89fe6ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    int oddk    = (K & 1);
+    int k_iters = ((K + 1) / 2) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            int k = k_iters;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi   v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi   v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi   v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi   v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v18.4s, #0x0\n"
+                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v20.4s, #0x0\n"
+                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v22.4s, #0x0\n"
+                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v24.4s, #0x0\n"
+                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v26.4s, #0x0\n"
+                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v28.4s, #0x0\n"
+                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v30.4s, #0x0\n"
+                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "ins    %[a0].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "ins    %[a1].d[1], x20\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "b.ne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a.
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "cbnz    %w[oddk], 2f\n"
+
+                // Even K continuation
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b    3f\n"
+
+                // Odd K continuation
+                "2:\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,   [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,   [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
new file mode 100644
index 0000000..42e870e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump = 0, long int block_jump = 0)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla      v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "fmla     v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    %q[a0], [%[a_ptr], #64]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla     v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #112]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "bne    1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "fmla     v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[block_jump]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla   v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "fmla     v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk), [row_jump] "r"(row_jump), [block_jump] "r"(block_jump)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
new file mode 100644
index 0000000..eceacc9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_native_16x4(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+// 16x4 native SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_native_16x4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 16;
+    static const int out_height = 4;
+    static const int k_unroll   = 1;
+
+    // Default to the generic kernel
+    kern_type kernel = a64_sgemm_native_16x4;
+
+    sgemm_native_16x4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
new file mode 100644
index 0000000..8d4a38c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -0,0 +1,869 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+namespace arm_gemm
+{
+void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
+{
+    const int oddk    = ((K % 8) >= 4) ? 1 : 0;
+    const int beta0   = (beta == 0.0f) ? 1 : 0;
+    const int oddones = (K % 4);
+
+    /* For now, very naive with no blocking */
+    for(int y = 0; y < M; y += 4)
+    {
+        for(int x0 = 0; x0 < N; x0 += 16)
+        {
+            const float *a_ptr0 = A + (y * lda);
+            const float *a_ptr1 = a_ptr0 + lda;
+            const float *a_ptr2 = a_ptr1 + lda;
+            const float *a_ptr3 = a_ptr2 + lda;
+
+            const float *b_ptr = B + x0;
+
+            float *c_ptr0 = C + (y * ldc) + x0;
+            float *c_ptr1 = c_ptr0 + ldc;
+            float *c_ptr2 = c_ptr1 + ldc;
+            float *c_ptr3 = c_ptr2 + ldc;
+
+            int loops = ((K + 4) / 8) - 1;
+            int odds  = oddones;
+
+            size_t ldbb = ldb * sizeof(float);
+
+            __asm __volatile(
+                "a0   .req v0\n"
+                "a1   .req v1\n"
+                "a2   .req v2\n"
+                "a3   .req v3\n"
+                "a0a  .req v4\n"
+                "a1a  .req v5\n"
+                "a2a  .req v6\n"
+                "a3a  .req v7\n"
+                "bb0  .req v8\n"
+                "bb1  .req v9\n"
+                "bb2  .req v10\n"
+                "bb3  .req v11\n"
+                "b0a  .req v12\n"
+                "b1a  .req v13\n"
+                "b2a  .req v14\n"
+                "b3a  .req v15\n"
+
+                "a0q  .req q0\n"
+                "a1q  .req q1\n"
+                "a2q  .req q2\n"
+                "a3q  .req q3\n"
+                "a0aq .req q4\n"
+                "a1aq .req q5\n"
+                "a2aq .req q6\n"
+                "a3aq .req q7\n"
+                "b0q  .req q8\n"
+                "b1q  .req q9\n"
+                "b2q  .req q10\n"
+                "b3q  .req q11\n"
+                "b0aq .req q12\n"
+                "b1aq .req q13\n"
+                "b2aq .req q14\n"
+                "b3aq .req q15\n"
+
+                "movi    v16.4s, #0x0\n"
+                "ldr    a0q, [%[a_ptr0]]\n"
+                "movi    v17.4s, #0x0\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+                "movi    v18.4s, #0x0\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+                "movi    v19.4s, #0x0\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+                "movi    v20.4s, #0x0\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+                "movi    v21.4s, #0x0\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "ldr    a1q, [%[a_ptr1]]\n"
+                "movi    v22.4s, #0x0\n"
+                "ldr    a2q, [%[a_ptr2]]\n"
+                "movi    v23.4s, #0x0\n"
+                "ldr    a3q, [%[a_ptr3]]\n"
+                "movi    v24.4s, #0x0\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+                "movi    v25.4s, #0x0\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "movi    v26.4s, #0x0\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "cbz    %w[beta0], 5f\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip if no complete loops.
+                "cbz    %w[loops], 4f\n"
+                "b    1f\n"
+
+                // If beta is non-zero, need to load and multiply by beta
+                "5:\n"
+                "ld1r    {v4.4s}, [%[betaptr]]\n"
+                "ldr    q16, [%[c_ptr0]]\n"
+                "ldr    q17, [%[c_ptr0], #16]\n"
+                "ldr    q18, [%[c_ptr0], #32]\n"
+                "ldr    q19, [%[c_ptr0], #48]\n"
+
+                "ldr    q20, [%[c_ptr1]]\n"
+                "fmul    v16.4s, v16.4s, v4.4s\n"
+                "ldr    q21, [%[c_ptr1], #16]\n"
+                "fmul    v17.4s, v17.4s, v4.4s\n"
+                "ldr    q22, [%[c_ptr1], #32]\n"
+                "fmul    v18.4s, v18.4s, v4.4s\n"
+                "ldr    q23, [%[c_ptr1], #48]\n"
+                "fmul    v19.4s, v19.4s, v4.4s\n"
+
+                "ldr    q24, [%[c_ptr2]]\n"
+                "fmul    v20.4s, v20.4s, v4.4s\n"
+                "ldr    q25, [%[c_ptr2], #16]\n"
+                "fmul    v21.4s, v21.4s, v4.4s\n"
+                "ldr    q26, [%[c_ptr2], #32]\n"
+                "fmul    v22.4s, v22.4s, v4.4s\n"
+                "ldr    q27, [%[c_ptr2], #48]\n"
+                "fmul    v23.4s, v23.4s, v4.4s\n"
+
+                "ldr    q28, [%[c_ptr3]]\n"
+                "fmul    v24.4s, v24.4s, v4.4s\n"
+                "ldr    q29, [%[c_ptr3], #16]\n"
+                "fmul    v25.4s, v25.4s, v4.4s\n"
+                "ldr    q30, [%[c_ptr3], #32]\n"
+                "fmul    v26.4s, v26.4s, v4.4s\n"
+                "ldr    q31, [%[c_ptr3], #48]\n"
+                "fmul    v27.4s, v27.4s, v4.4s\n"
+
+                "fmul    v28.4s, v28.4s, v4.4s\n"
+                "fmul    v29.4s, v29.4s, v4.4s\n"
+                "fmul    v30.4s, v30.4s, v4.4s\n"
+                "fmul    v31.4s, v31.4s, v4.4s\n"
+
+                "cbz    %w[loops], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "fmla    v16.4s, bb0.4s, a0.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[0]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[0]\n"
+                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
+                "ldr    a0aq, [%[a_ptr0], #16]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
+                "ldr    a1aq, [%[a_ptr1], #16]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
+                "ldr    a2aq, [%[a_ptr2], #16]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 1
+                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
+                "ldr    a3aq, [%[a_ptr3], #16]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
+                "subs    %w[loops], %w[loops], #1\n"
+                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 2
+                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
+                "add    %[a_ptr0], %[a_ptr0], #32\n"
+                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
+                "add    %[a_ptr1], %[a_ptr1], #32\n"
+                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
+                "add    %[a_ptr2], %[a_ptr2], #32\n"
+                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
+                "add    %[a_ptr3], %[a_ptr3], #32\n"
+                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 3
+                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "ldr    a0q, [%[a_ptr0]]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 4
+                "fmla    v16.4s, bb0.4s, a0a.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[0]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[0]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[0]\n"
+                "ldr    a1q, [%[a_ptr1]]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[0]\n"
+                "ldr    a2q, [%[a_ptr2]]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[0]\n"
+                "ldr    a3q, [%[a_ptr3]]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 5
+                "fmla    v16.4s, b0a.4s, a0a.s[1]\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2a.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3a.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0a.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[1]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0a.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0a.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 6
+                "fmla    v16.4s, bb0.4s, a0a.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[2]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[2]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[2]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[2]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 7
+                "fmla    v16.4s, b0a.4s, a0a.s[3]\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
+                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0a.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0a.s[3]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0a.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
+                "bne    1b\n"
+
+                // Skip to here
+                "4:\n"
+
+                // Detached final iteration
+                // Unroll 0
+                "fmla    v16.4s, bb0.4s, a0.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[0]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[0]\n"
+                "cbnz    %w[oddk], 2f\n" // Deal with odd K before we load a0a
+                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
+                "ldr    a0aq, [%[a_ptr0], #16]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
+                "ldr    a1aq, [%[a_ptr1], #16]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
+                "ldr    a2aq, [%[a_ptr2], #16]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 1
+                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
+                "ldr    a3aq, [%[a_ptr3], #16]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
+                "add    %[a_ptr0], %[a_ptr0], #32\n"
+                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
+                "add    %[a_ptr1], %[a_ptr1], #32\n"
+                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
+                "add    %[a_ptr2], %[a_ptr2], #32\n"
+                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
+                "add    %[a_ptr3], %[a_ptr3], #32\n"
+                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 2
+                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 3
+                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 4
+                "fmla    v16.4s, bb0.4s, a0a.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[0]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[0]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[0]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[0]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[0]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 5
+                "fmla    v16.4s, b0a.4s, a0a.s[1]\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2a.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3a.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0a.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[1]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0a.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0a.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 6
+                "fmla    v16.4s, bb0.4s, a0a.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[2]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[2]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[2]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[2]\n"
+
+                // Unroll 7
+                "fmla    v16.4s, b0a.4s, a0a.s[3]\n"
+                "fmla    v17.4s, b1a.4s, a0a.s[3]\n"
+                "fmla    v18.4s, b2a.4s, a0a.s[3]\n"
+                "fmla    v19.4s, b3a.4s, a0a.s[3]\n"
+                "cbnz    %w[odds], 6f\n"
+
+                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
+                "str    q16, [%[c_ptr0]]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
+                "str    q17, [%[c_ptr0], #16]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
+                "str    q18, [%[c_ptr0], #32]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
+                "str    q19, [%[c_ptr0], #48]\n"
+
+                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
+                "str    q20, [%[c_ptr1]]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
+                "str    q21, [%[c_ptr1], #16]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
+                "str    q22, [%[c_ptr1], #32]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
+                "str    q23, [%[c_ptr1], #48]\n"
+
+                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
+                "str    q24, [%[c_ptr2]]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
+                "str    q25, [%[c_ptr2], #16]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
+                "str    q26, [%[c_ptr2], #32]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
+                "str    q27, [%[c_ptr2], #48]\n"
+                "b    3f\n"
+
+                // Odd K case: Just do 4 more.
+                "2:\n"
+                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
+                "add    %[a_ptr0], %[a_ptr0], #16\n"
+                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
+                "add    %[a_ptr1], %[a_ptr1], #16\n"
+                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
+                "add    %[a_ptr2], %[a_ptr2], #16\n"
+                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
+                "add    %[a_ptr3], %[a_ptr3], #16\n"
+                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 1
+                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 2
+                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+
+                // Unroll 3
+                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
+                "cbnz    %w[odds], 7f\n"
+
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "str    q16, [%[c_ptr0]]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "str    q17, [%[c_ptr0], #16]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "str    q18, [%[c_ptr0], #32]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "str    q19, [%[c_ptr0], #48]\n"
+
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "str    q20, [%[c_ptr1]]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "str    q21, [%[c_ptr1], #16]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "str    q22, [%[c_ptr1], #32]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "str    q23, [%[c_ptr1], #48]\n"
+
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "str    q24, [%[c_ptr2]]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "str    q25, [%[c_ptr2], #16]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "str    q26, [%[c_ptr2], #32]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "str    q27, [%[c_ptr2], #48]\n"
+                "b    3f\n"
+
+                // "Odd ones" - lead in from even
+                "6:\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
+                "subs    %w[odds], %w[odds], #1\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
+                "ld1r    {a0.4s}, [%[a_ptr0]], #4\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
+                "ld1r    {a1.4s}, [%[a_ptr1]], #4\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
+
+                "fmla    v16.4s, bb0.4s, a0.4s\n"
+                "beq    9f\n"
+                "b    8f\n"
+
+                // "Odd ones" - lead in from odd
+                "7:\n"
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "subs    %w[odds], %w[odds], #1\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "ld1r    {a0.4s}, [%[a_ptr0]], #4\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "ld1r    {a1.4s}, [%[a_ptr1]], #4\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+
+                "fmla    v16.4s, bb0.4s, a0.4s\n"
+                "beq    9f\n"
+
+                // "Odd ones" - loop
+                "8:\n"
+                "fmla    v17.4s, bb1.4s, a0.4s\n"
+                "ld1r    {a2.4s}, [%[a_ptr2]], #4\n"
+                "fmla    v18.4s, bb2.4s, a0.4s\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v19.4s, bb3.4s, a0.4s\n"
+                "ld1r    {a3.4s}, [%[a_ptr3]], #4\n"
+
+                "fmla    v20.4s, bb0.4s, a1.4s\n"
+                "subs    %w[odds], %w[odds], #1\n"
+                "fmla    v21.4s, bb1.4s, a1.4s\n"
+                "ld1r    {a0.4s}, [%[a_ptr0]], #4\n"
+                "fmla    v22.4s, bb2.4s, a1.4s\n"
+                "fmla    v23.4s, bb3.4s, a1.4s\n"
+                "ld1r    {a1.4s}, [%[a_ptr1]], #4\n"
+
+                "fmla    v24.4s, bb0.4s, a2.4s\n"
+                "fmla    v28.4s, bb0.4s, a3.4s\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla    v25.4s, bb1.4s, a2.4s\n"
+                "fmla    v29.4s, bb1.4s, a3.4s\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v26.4s, bb2.4s, a2.4s\n"
+                "fmla    v30.4s, bb2.4s, a3.4s\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla    v27.4s, bb3.4s, a2.4s\n"
+                "fmla    v31.4s, bb3.4s, a3.4s\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla    v16.4s, bb0.4s, a0.4s\n"
+                "bne    8b\n"
+
+                // "Odd ones" - detached final iteration
+                "9:\n"
+                "fmla    v17.4s, bb1.4s, a0.4s\n"
+                "ld1r    {a2.4s}, [%[a_ptr2]], #4\n"
+                "fmla    v18.4s, bb2.4s, a0.4s\n"
+                "fmla    v19.4s, bb3.4s, a0.4s\n"
+                "ld1r    {a3.4s}, [%[a_ptr3]], #4\n"
+
+                "fmla    v20.4s, bb0.4s, a1.4s\n"
+                "str    q16, [%[c_ptr0]]\n"
+                "fmla    v21.4s, bb1.4s, a1.4s\n"
+                "str    q17, [%[c_ptr0], #16]\n"
+                "fmla    v22.4s, bb2.4s, a1.4s\n"
+                "str    q18, [%[c_ptr0], #32]\n"
+                "fmla    v23.4s, bb3.4s, a1.4s\n"
+                "str    q19, [%[c_ptr0], #48]\n"
+
+                "fmla    v24.4s, bb0.4s, a2.4s\n"
+                "str    q20, [%[c_ptr1]]\n"
+                "fmla    v25.4s, bb1.4s, a2.4s\n"
+                "str    q21, [%[c_ptr1], #16]\n"
+                "fmla    v26.4s, bb2.4s, a2.4s\n"
+                "str    q22, [%[c_ptr1], #32]\n"
+                "fmla    v27.4s, bb3.4s, a2.4s\n"
+                "str    q23, [%[c_ptr1], #48]\n"
+
+                "fmla    v28.4s, bb0.4s, a3.4s\n"
+                "str    q24, [%[c_ptr2]]\n"
+                "fmla    v29.4s, bb1.4s, a3.4s\n"
+                "str    q25, [%[c_ptr2], #16]\n"
+                "fmla    v30.4s, bb2.4s, a3.4s\n"
+                "str    q26, [%[c_ptr2], #32]\n"
+                "fmla    v31.4s, bb3.4s, a3.4s\n"
+                "str    q27, [%[c_ptr2], #48]\n"
+
+                "3:\n"
+                "str    q28, [%[c_ptr3]]\n"
+                "str    q29, [%[c_ptr3], #16]\n"
+                "str    q30, [%[c_ptr3], #32]\n"
+                "str    q31, [%[c_ptr3], #48]\n"
+
+                : [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3),
+                [b_ptr] "+r"(b_ptr), [loops] "+r"(loops), [odds] "+r"(odds)
+                : [ldb] "r"(ldbb), [oddk] "r"(oddk), [beta0] "r"(beta0), [betaptr] "r"(&beta),
+                [c_ptr0] "r"(c_ptr0), [c_ptr1] "r"(c_ptr1), [c_ptr2] "r"(c_ptr2), [c_ptr3] "r"(c_ptr3)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+                "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+                "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
new file mode 100644
index 0000000..c89514f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemv_pretransposed(const float *, int, const float *, float *, float, int, int);
+
+// Pretransposed SGEMV strategy class.
+// Strategy descriptor consumed by the arm_gemm framework: it pairs the
+// kernel entry point with the data-layout constants the pretranspose and
+// dispatch code need.
+class sgemv_pretransposed
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    // Kernel signature: (A, lda, X, Y, beta, M, N) — presumably matching the
+    // definition in a64_sgemv_pretransposed/generic.cpp; confirm against that file.
+    typedef void (*kern_type)(const float *, int, const float *, float *, float, int, int);
+
+    /* Describes the data layout for matrix (A) input */
+
+    /* Note that often GEMV is expressed as a GEMM with M=1, i.e.  A is the
+     * (row) vector and B is the matrix, but the standard GEMV arrangement
+     * is matrix A times (column) vector X.  "A_transpose" is expressed in
+     * terms of this standard arrangement, so if the A matrix is in fact the
+     * B matrix from a GEMM call, the sense of the transpose needs to be
+     * reversed.  */
+    static const int  A_interleave = 32;
+    static const int  A_block      = 1;
+    static const bool A_transpose  = false;
+
+    /* Kernel blocking parameters */
+    // out_width = 32 matches the 32-element column panels the kernel writes.
+    static const int out_width = 32;
+    static const int k_unroll  = 1;
+
+    kern_type kernel = a64_sgemv_pretransposed;
+
+    // ci is unused: this strategy has a single generic implementation and
+    // does not select a variant by CPU.
+    sgemv_pretransposed(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
new file mode 100644
index 0000000..2907598
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N)
+{
+    // Computes Y = beta * Y + A * X, where A has been pretransposed into
+    // 32-wide column panels (lda is the stride between panels).  beta==0 and
+    // beta==1 are special-cased so the kernel can skip the initial load of Y
+    // and the scaling multiply respectively.
+    const bool beta0 = (beta == 0.0f);
+    const bool beta1 = (beta == 1.0f);
+
+    for(int x = 0; x < N; x += 32)
+    {
+        float *y_ptr = Y + x;
+
+        // How many elements are we processing in this loop?
+        int l = std::min(N - x, 32);
+
+        // Accumulators are pinned to v24-v31 so the inline asm below can
+        // reference them via operand constraints while streaming the A panel
+        // through v2-v23.
+        register float32x4_t r0 asm("v24");
+        register float32x4_t r1 asm("v25");
+        register float32x4_t r2 asm("v26");
+        register float32x4_t r3 asm("v27");
+        register float32x4_t r4 asm("v28");
+        register float32x4_t r5 asm("v29");
+        register float32x4_t r6 asm("v30");
+        register float32x4_t r7 asm("v31");
+
+        // Two registers of X values: x0 holds the current 4, x0a the next 4.
+        register float32x4_t x0 asm("v0");
+        register float32x4_t x0a asm("v1");
+
+        const float *x_ptr = X;
+        const float *a_ptr = A + ((x / 32) * lda);
+
+        if(beta0)
+        {
+            // beta == 0: start accumulating from zero, never read Y.
+            r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = vdupq_n_f32(0.0f);
+        }
+        else
+        {
+            if(l == 32)
+            {
+                // Fastest path - load all 8 vectors
+                r0 = vld1q_f32(y_ptr);
+                r1 = vld1q_f32(y_ptr + 4);
+                r2 = vld1q_f32(y_ptr + 8);
+                r3 = vld1q_f32(y_ptr + 12);
+                r4 = vld1q_f32(y_ptr + 16);
+                r5 = vld1q_f32(y_ptr + 20);
+                r6 = vld1q_f32(y_ptr + 24);
+                r7 = vld1q_f32(y_ptr + 28);
+            }
+            else
+            {
+                // Slow case - leftovers.  Note that we don't care about
+                // out-of-range vectors and lanes as we will throw them away at
+                // the end.
+                int vecs    = l / 4; // How many leftover vectors?
+                int oddbits = l % 4; // And how many odd single values?
+
+                if(oddbits)
+                {
+                    // Load the outstanding odd values into a vector first
+                    float32x4_t oddvec  = vdupq_n_f32(0.0f); // This does not really need to be initialized, but the compiler has a hard time with that.
+                    float      *oddbase = y_ptr + l - oddbits;
+
+                    switch(oddbits)
+                    {
+                        case 3:
+                            oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
+                        // fall through
+                        case 2:
+                            oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
+                        // fall through
+                        case 1:
+                            oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
+                            break;
+
+                        default:
+                            UNREACHABLE("Impossible case in switch.");
+                    }
+
+                    // Now load the whole vectors, putting the oddments in when we run out.
+                    // The do/while(0) is a structured way to break out as soon
+                    // as the partial vector has been placed.
+                    do
+                    {
+                        if(vecs == 0)
+                        {
+                            r0 = oddvec;
+                            break;
+                        }
+
+                        r0 = vld1q_f32(y_ptr);
+                        if(--vecs == 0)
+                        {
+                            r1 = oddvec;
+                            break;
+                        }
+
+                        r1 = vld1q_f32(y_ptr + 4);
+                        if(--vecs == 0)
+                        {
+                            r2 = oddvec;
+                            break;
+                        }
+
+                        r2 = vld1q_f32(y_ptr + 8);
+                        if(--vecs == 0)
+                        {
+                            r3 = oddvec;
+                            break;
+                        }
+
+                        r3 = vld1q_f32(y_ptr + 12);
+                        if(--vecs == 0)
+                        {
+                            r4 = oddvec;
+                            break;
+                        }
+
+                        r4 = vld1q_f32(y_ptr + 16);
+                        if(--vecs == 0)
+                        {
+                            r5 = oddvec;
+                            break;
+                        }
+
+                        r5 = vld1q_f32(y_ptr + 20);
+                        if(--vecs == 0)
+                        {
+                            r6 = oddvec;
+                            break;
+                        }
+
+                        r6 = vld1q_f32(y_ptr + 24);
+                        r7 = oddvec;
+                    }
+                    while(0);
+                }
+                else
+                {
+                    // Slightly less slow path - just load the whole vectors
+                    do
+                    {
+                        // It can't be the case that oddbits==0 AND vecs==0 or we wouldn't be here.
+                        if(vecs == 0)
+                        {
+                            UNREACHABLE("Impossible lack of work to do");
+                        }
+
+                        r0 = vld1q_f32(y_ptr);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r1 = vld1q_f32(y_ptr + 4);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r2 = vld1q_f32(y_ptr + 8);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r3 = vld1q_f32(y_ptr + 12);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r4 = vld1q_f32(y_ptr + 16);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r5 = vld1q_f32(y_ptr + 20);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r6 = vld1q_f32(y_ptr + 24);
+                    }
+                    while(0);
+                }
+            }
+
+            if(!beta1)
+            {
+                // General beta: scale the loaded Y values before accumulating.
+                const float32x4_t vb = vdupq_n_f32(beta);
+
+                r0 = vmulq_f32(r0, vb);
+                r1 = vmulq_f32(r1, vb);
+                r2 = vmulq_f32(r2, vb);
+                r3 = vmulq_f32(r3, vb);
+                r4 = vmulq_f32(r4, vb);
+                r5 = vmulq_f32(r5, vb);
+                r6 = vmulq_f32(r6, vb);
+                r7 = vmulq_f32(r7, vb);
+            }
+        }
+
+        // Main computation: consume X in chunks of 8 values, with the inner
+        // loop fully unrolled in inline assembly.  v2-v23 stream the A panel,
+        // the pinned v24-v31 hold the 32 accumulators, and the prologue
+        // prefetches ahead of the load stream.
+        if(M >= 8)
+        {
+            int k = (M / 8) - 1;
+            x0    = vld1q_f32(x_ptr);
+
+            __asm __volatile(
+                "ldr    q2, [%[a_ptr], #0]\n"
+                "ldr    q3, [%[a_ptr], #16]\n"
+                "ldr    q4, [%[a_ptr], #32]\n"
+                "ldr    q5, [%[a_ptr], #48]\n"
+                "ldr    q6, [%[a_ptr], #64]\n"
+                "ldr    q7, [%[a_ptr], #80]\n"
+                "ldr    q8, [%[a_ptr], #96]\n"
+                "ldr    q9, [%[a_ptr], #112]\n"
+                "ldr    q10, [%[a_ptr], #128]\n"
+                "ldr    q11, [%[a_ptr], #144]\n"
+                "ldr    q12, [%[a_ptr], #160]\n"
+                "ldr    q13, [%[a_ptr], #176]\n"
+                "ldr    q14, [%[a_ptr], #192]\n"
+                "ldr    q15, [%[a_ptr], #208]\n"
+                "ldr    q16, [%[a_ptr], #224]\n"
+                "ldr    q17, [%[a_ptr], #240]\n"
+                "ldr    q18, [%[a_ptr], #256]\n"
+                "ldr    q19, [%[a_ptr], #272]\n"
+                "ldr    q20, [%[a_ptr], #288]\n"
+                "ldr    q21, [%[a_ptr], #304]\n"
+                "ldr    q22, [%[a_ptr], #320]\n"
+                "ldr    q23, [%[a_ptr], #336]\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                ASM_PREFETCH("[%[a_ptr], #448]")
+                ASM_PREFETCH("[%[a_ptr], #512]")
+                ASM_PREFETCH("[%[a_ptr], #576]")
+                ASM_PREFETCH("[%[a_ptr], #640]")
+                ASM_PREFETCH("[%[a_ptr], #704]")
+                ASM_PREFETCH("[%[a_ptr], #768]")
+                ASM_PREFETCH("[%[a_ptr], #832]")
+                ASM_PREFETCH("[%[a_ptr], #896]")
+                ASM_PREFETCH("[%[a_ptr], #960]")
+                ASM_PREFETCH("[%[a_ptr], #1024]")
+                ASM_PREFETCH("[%[a_ptr], #1088]")
+                ASM_PREFETCH("[%[a_ptr], #1152]")
+                ASM_PREFETCH("[%[a_ptr], #1216]")
+                ASM_PREFETCH("[%[a_ptr], #1280]")
+                ASM_PREFETCH("[%[a_ptr], #1344]")
+                ASM_PREFETCH("[%[a_ptr], #1408]")
+                ASM_PREFETCH("[%[a_ptr], #1472]")
+                ASM_PREFETCH("[%[a_ptr], #1536]")
+                ASM_PREFETCH("[%[a_ptr], #1600]")
+                ASM_PREFETCH("[%[a_ptr], #1664]")
+                ASM_PREFETCH("[%[a_ptr], #1728]")
+                ASM_PREFETCH("[%[a_ptr], #1792]")
+                ASM_PREFETCH("[%[a_ptr], #1856]")
+                ASM_PREFETCH("[%[a_ptr], #1920]")
+                ASM_PREFETCH("[%[a_ptr], #1984]")
+                "add    %[a_ptr], %[a_ptr], #352\n"
+
+                "cbz    %w[k], 2f\n"
+
+                "1:\n"
+                // Unroll 0
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr    %q[x0a], [%[x_ptr], #16]\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr    q3, [%[a_ptr], #0]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr    q4, [%[a_ptr], #16]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr    q5, [%[a_ptr], #32]\n"
+                "add    %[x_ptr], %[x_ptr], #32\n" ASM_PREFETCH("[%[a_ptr], #1664]")
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr    q6, [%[a_ptr], #48]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr    q7, [%[a_ptr], #64]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr    q8, [%[a_ptr], #80]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr    q9, [%[a_ptr], #96]\n" ASM_PREFETCH("[%[a_ptr], #1728]")
+
+                // Unroll 1
+                "fmla    %[r0].4s, v10.4s, %[x0].s[1]\n"
+                "ldr    q10, [%[a_ptr], #112]\n"
+                "fmla    %[r1].4s, v11.4s, %[x0].s[1]\n"
+                "ldr    q11, [%[a_ptr], #128]\n"
+                "fmla    %[r2].4s, v12.4s, %[x0].s[1]\n"
+                "ldr    q12, [%[a_ptr], #144]\n"
+                "fmla    %[r3].4s, v13.4s, %[x0].s[1]\n"
+                "ldr    q13, [%[a_ptr], #160]\n" ASM_PREFETCH("[%[a_ptr], #1792]")
+                "fmla    %[r4].4s, v14.4s, %[x0].s[1]\n"
+                "ldr    q14, [%[a_ptr], #176]\n"
+                "fmla    %[r5].4s, v15.4s, %[x0].s[1]\n"
+                "ldr    q15, [%[a_ptr], #192]\n"
+                "fmla    %[r6].4s, v16.4s, %[x0].s[1]\n"
+                "ldr    q16, [%[a_ptr], #208]\n"
+                "fmla    %[r7].4s, v17.4s, %[x0].s[1]\n"
+                "ldr    q17, [%[a_ptr], #224]\n" ASM_PREFETCH("[%[a_ptr], #1856]")
+
+                // Unroll 2
+                "fmla    %[r0].4s, v18.4s, %[x0].s[2]\n"
+                "ldr    q18, [%[a_ptr], #240]\n"
+                "fmla    %[r1].4s, v19.4s, %[x0].s[2]\n"
+                "ldr    q19, [%[a_ptr], #256]\n"
+                "fmla    %[r2].4s, v20.4s, %[x0].s[2]\n"
+                "ldr    q20, [%[a_ptr], #272]\n"
+                "fmla    %[r3].4s, v21.4s, %[x0].s[2]\n"
+                "ldr    q21, [%[a_ptr], #288]\n" ASM_PREFETCH("[%[a_ptr], #1920]")
+                "fmla    %[r4].4s, v22.4s, %[x0].s[2]\n"
+                "ldr    q22, [%[a_ptr], #304]\n"
+                "fmla    %[r5].4s, v23.4s, %[x0].s[2]\n"
+                "ldr    q23, [%[a_ptr], #320]\n"
+                "fmla    %[r6].4s, v3.4s, %[x0].s[2]\n"
+                "ldr    q2, [%[a_ptr], #336]\n"
+                "ldr    q3, [%[a_ptr], #352]\n"
+                "fmla    %[r7].4s, v4.4s, %[x0].s[2]\n"
+                "ldr    q4, [%[a_ptr], #368]\n" ASM_PREFETCH("[%[a_ptr], #1984]")
+
+                // Unroll 3
+                "fmla    %[r0].4s, v5.4s, %[x0].s[3]\n"
+                "ldr    q5, [%[a_ptr], #384]\n"
+                "fmla    %[r1].4s, v6.4s, %[x0].s[3]\n"
+                "ldr    q6, [%[a_ptr], #400]\n"
+                "fmla    %[r2].4s, v7.4s, %[x0].s[3]\n"
+                "ldr    q7, [%[a_ptr], #416]\n"
+                "fmla    %[r3].4s, v8.4s, %[x0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2048]")
+                "ldr    q8, [%[a_ptr], #432]\n"
+                "fmla    %[r4].4s, v9.4s, %[x0].s[3]\n"
+                "ldr    q9, [%[a_ptr], #448]\n"
+                "fmla    %[r5].4s, v10.4s, %[x0].s[3]\n"
+                "ldr    q10, [%[a_ptr], #464]\n"
+                "fmla    %[r6].4s, v11.4s, %[x0].s[3]\n"
+                "ldr    q11, [%[a_ptr], #480]\n"
+                "fmla    %[r7].4s, v12.4s, %[x0].s[3]\n"
+                "ldr    q12, [%[a_ptr], #496]\n" ASM_PREFETCH("[%[a_ptr], #2112]")
+
+                // Unroll 4
+                "fmla    %[r0].4s, v13.4s, %[x0a].s[0]\n"
+                "ldr    %q[x0], [%[x_ptr]]\n"
+                "fmla    %[r1].4s, v14.4s, %[x0a].s[0]\n"
+                "ldr    q14, [%[a_ptr], #512]\n"
+                "fmla    %[r2].4s, v15.4s, %[x0a].s[0]\n"
+                "ldr    q15, [%[a_ptr], #528]\n"
+                "fmla    %[r3].4s, v16.4s, %[x0a].s[0]\n" ASM_PREFETCH("[%[a_ptr], #2176]")
+                "ldr    q16, [%[a_ptr], #544]\n"
+                "fmla    %[r4].4s, v17.4s, %[x0a].s[0]\n"
+                "ldr    q17, [%[a_ptr], #560]\n"
+                "fmla    %[r5].4s, v18.4s, %[x0a].s[0]\n"
+                "ldr    q18, [%[a_ptr], #576]\n"
+                "fmla    %[r6].4s, v19.4s, %[x0a].s[0]\n"
+                "ldr    q19, [%[a_ptr], #592]\n"
+                "fmla    %[r7].4s, v20.4s, %[x0a].s[0]\n"
+                "ldr    q20, [%[a_ptr], #608]\n" ASM_PREFETCH("[%[a_ptr], #2240]")
+
+                // Unroll 5
+                "fmla    %[r0].4s, v21.4s, %[x0a].s[1]\n"
+                "ldr    q21, [%[a_ptr], #624]\n"
+                "fmla    %[r1].4s, v22.4s, %[x0a].s[1]\n"
+                "ldr    q22, [%[a_ptr], #640]\n"
+                "fmla    %[r2].4s, v23.4s, %[x0a].s[1]\n"
+                "ldr    q23, [%[a_ptr], #656]\n"
+                "fmla    %[r3].4s, v2.4s, %[x0a].s[1]\n"
+                "ldr    q2, [%[a_ptr], #672]\n" ASM_PREFETCH("[%[a_ptr], #2304]")
+                "fmla    %[r4].4s, v3.4s, %[x0a].s[1]\n"
+                "ldr    q3, [%[a_ptr], #688]\n"
+                "fmla    %[r5].4s, v4.4s, %[x0a].s[1]\n"
+                "ldr    q4, [%[a_ptr], #704]\n"
+                "fmla    %[r6].4s, v5.4s, %[x0a].s[1]\n"
+                "ldr    q5, [%[a_ptr], #720]\n"
+                "fmla    %[r7].4s, v6.4s, %[x0a].s[1]\n"
+                "ldr    q6, [%[a_ptr], #736]\n" ASM_PREFETCH("[%[a_ptr], #2368]")
+
+                // Unroll 6
+                "fmla    %[r0].4s, v7.4s, %[x0a].s[2]\n"
+                "ldr    q7, [%[a_ptr], #752]\n"
+                "fmla    %[r1].4s, v8.4s, %[x0a].s[2]\n"
+                "ldr    q8, [%[a_ptr], #768]\n"
+                "fmla    %[r2].4s, v9.4s, %[x0a].s[2]\n"
+                "ldr    q9, [%[a_ptr], #784]\n"
+                "fmla    %[r3].4s, v10.4s, %[x0a].s[2]\n"
+                "ldr    q10, [%[a_ptr], #800]\n" ASM_PREFETCH("[%[a_ptr], #2432]")
+                "fmla    %[r4].4s, v11.4s, %[x0a].s[2]\n"
+                "ldr    q11, [%[a_ptr], #816]\n"
+                "fmla    %[r5].4s, v12.4s, %[x0a].s[2]\n"
+                "ldr    q12, [%[a_ptr], #832]\n"
+                "fmla    %[r6].4s, v14.4s, %[x0a].s[2]\n"
+                "ldr    q13, [%[a_ptr], #848]\n"
+                "ldr    q14, [%[a_ptr], #864]\n"
+                "fmla    %[r7].4s, v15.4s, %[x0a].s[2]\n"
+                "ldr    q15, [%[a_ptr], #880]\n" ASM_PREFETCH("[%[a_ptr], #2496]")
+
+                // Unroll 7
+                "fmla    %[r0].4s, v16.4s, %[x0a].s[3]\n"
+                "ldr    q16, [%[a_ptr], #896]\n"
+                "fmla    %[r1].4s, v17.4s, %[x0a].s[3]\n"
+                "ldr    q17, [%[a_ptr], #912]\n"
+                "fmla    %[r2].4s, v18.4s, %[x0a].s[3]\n"
+                "ldr    q18, [%[a_ptr], #928]\n"
+                "fmla    %[r3].4s, v19.4s, %[x0a].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2560]")
+                "ldr    q19, [%[a_ptr], #944]\n"
+                "fmla    %[r4].4s, v20.4s, %[x0a].s[3]\n"
+                "ldr    q20, [%[a_ptr], #960]\n"
+                "fmla    %[r5].4s, v21.4s, %[x0a].s[3]\n"
+                "ldr    q21, [%[a_ptr], #976]\n"
+                "add    %[a_ptr], %[a_ptr], #1024\n"
+                "fmla    %[r6].4s, v22.4s, %[x0a].s[3]\n"
+                "ldr    q22, [%[a_ptr], #-32]\n"
+                "fmla    %[r7].4s, v23.4s, %[x0a].s[3]\n"
+                "ldr    q23, [%[a_ptr], #-16]\n" ASM_PREFETCH("[%[a_ptr], #1600]")
+                "bne    1b\n"
+
+                // Detached final iteration
+                "2:\n"
+
+                // Unroll 0
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr    %q[x0a], [%[x_ptr], #16]\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr    q3, [%[a_ptr], #0]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr    q4, [%[a_ptr], #16]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr    q5, [%[a_ptr], #32]\n"
+                "add    %[x_ptr], %[x_ptr], #32\n"
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr    q6, [%[a_ptr], #48]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr    q7, [%[a_ptr], #64]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr    q8, [%[a_ptr], #80]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr    q9, [%[a_ptr], #96]\n"
+
+                // Unroll 1
+                "fmla    %[r0].4s, v10.4s, %[x0].s[1]\n"
+                "ldr    q10, [%[a_ptr], #112]\n"
+                "fmla    %[r1].4s, v11.4s, %[x0].s[1]\n"
+                "ldr    q11, [%[a_ptr], #128]\n"
+                "fmla    %[r2].4s, v12.4s, %[x0].s[1]\n"
+                "ldr    q12, [%[a_ptr], #144]\n"
+                "fmla    %[r3].4s, v13.4s, %[x0].s[1]\n"
+                "ldr    q13, [%[a_ptr], #160]\n"
+                "fmla    %[r4].4s, v14.4s, %[x0].s[1]\n"
+                "ldr    q14, [%[a_ptr], #176]\n"
+                "fmla    %[r5].4s, v15.4s, %[x0].s[1]\n"
+                "ldr    q15, [%[a_ptr], #192]\n"
+                "fmla    %[r6].4s, v16.4s, %[x0].s[1]\n"
+                "ldr    q16, [%[a_ptr], #208]\n"
+                "fmla    %[r7].4s, v17.4s, %[x0].s[1]\n"
+                "ldr    q17, [%[a_ptr], #224]\n"
+
+                // Unroll 2
+                "fmla    %[r0].4s, v18.4s, %[x0].s[2]\n"
+                "ldr    q18, [%[a_ptr], #240]\n"
+                "fmla    %[r1].4s, v19.4s, %[x0].s[2]\n"
+                "ldr    q19, [%[a_ptr], #256]\n"
+                "fmla    %[r2].4s, v20.4s, %[x0].s[2]\n"
+                "ldr    q20, [%[a_ptr], #272]\n"
+                "fmla    %[r3].4s, v21.4s, %[x0].s[2]\n"
+                "ldr    q21, [%[a_ptr], #288]\n"
+                "fmla    %[r4].4s, v22.4s, %[x0].s[2]\n"
+                "ldr    q22, [%[a_ptr], #304]\n"
+                "fmla    %[r5].4s, v23.4s, %[x0].s[2]\n"
+                "ldr    q23, [%[a_ptr], #320]\n"
+                "fmla    %[r6].4s, v3.4s, %[x0].s[2]\n"
+                "ldr    q2, [%[a_ptr], #336]\n"
+                "ldr    q3, [%[a_ptr], #352]\n"
+                "fmla    %[r7].4s, v4.4s, %[x0].s[2]\n"
+                "ldr    q4, [%[a_ptr], #368]\n"
+
+                // Unroll 3
+                "fmla    %[r0].4s, v5.4s, %[x0].s[3]\n"
+                "ldr    q5, [%[a_ptr], #384]\n"
+                "fmla    %[r1].4s, v6.4s, %[x0].s[3]\n"
+                "ldr    q6, [%[a_ptr], #400]\n"
+                "fmla    %[r2].4s, v7.4s, %[x0].s[3]\n"
+                "ldr    q7, [%[a_ptr], #416]\n"
+                "fmla    %[r3].4s, v8.4s, %[x0].s[3]\n"
+                "ldr    q8, [%[a_ptr], #432]\n"
+                "fmla    %[r4].4s, v9.4s, %[x0].s[3]\n"
+                "ldr    q9, [%[a_ptr], #448]\n"
+                "fmla    %[r5].4s, v10.4s, %[x0].s[3]\n"
+                "ldr    q10, [%[a_ptr], #464]\n"
+                "fmla    %[r6].4s, v11.4s, %[x0].s[3]\n"
+                "ldr    q11, [%[a_ptr], #480]\n"
+                "fmla    %[r7].4s, v12.4s, %[x0].s[3]\n"
+                "ldr    q12, [%[a_ptr], #496]\n"
+
+                // Unroll 4
+                "fmla    %[r0].4s, v13.4s, %[x0a].s[0]\n"
+                "fmla    %[r1].4s, v14.4s, %[x0a].s[0]\n"
+                "ldr    q14, [%[a_ptr], #512]\n"
+                "fmla    %[r2].4s, v15.4s, %[x0a].s[0]\n"
+                "ldr    q15, [%[a_ptr], #528]\n"
+                "fmla    %[r3].4s, v16.4s, %[x0a].s[0]\n"
+                "ldr    q16, [%[a_ptr], #544]\n"
+                "fmla    %[r4].4s, v17.4s, %[x0a].s[0]\n"
+                "ldr    q17, [%[a_ptr], #560]\n"
+                "fmla    %[r5].4s, v18.4s, %[x0a].s[0]\n"
+                "ldr    q18, [%[a_ptr], #576]\n"
+                "fmla    %[r6].4s, v19.4s, %[x0a].s[0]\n"
+                "ldr    q19, [%[a_ptr], #592]\n"
+                "fmla    %[r7].4s, v20.4s, %[x0a].s[0]\n"
+                "ldr    q20, [%[a_ptr], #608]\n"
+
+                // Unroll 5
+                "fmla    %[r0].4s, v21.4s, %[x0a].s[1]\n"
+                "ldr    q21, [%[a_ptr], #624]\n"
+                "fmla    %[r1].4s, v22.4s, %[x0a].s[1]\n"
+                "ldr    q22, [%[a_ptr], #640]\n"
+                "fmla    %[r2].4s, v23.4s, %[x0a].s[1]\n"
+                "ldr    q23, [%[a_ptr], #656]\n"
+                "fmla    %[r3].4s, v2.4s, %[x0a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #672\n"
+                "fmla    %[r4].4s, v3.4s, %[x0a].s[1]\n"
+                "fmla    %[r5].4s, v4.4s, %[x0a].s[1]\n"
+                "fmla    %[r6].4s, v5.4s, %[x0a].s[1]\n"
+                "fmla    %[r7].4s, v6.4s, %[x0a].s[1]\n"
+
+                // Unroll 6
+                "fmla    %[r0].4s, v7.4s, %[x0a].s[2]\n"
+                "fmla    %[r1].4s, v8.4s, %[x0a].s[2]\n"
+                "fmla    %[r2].4s, v9.4s, %[x0a].s[2]\n"
+                "fmla    %[r3].4s, v10.4s, %[x0a].s[2]\n"
+                "fmla    %[r4].4s, v11.4s, %[x0a].s[2]\n"
+                "fmla    %[r5].4s, v12.4s, %[x0a].s[2]\n"
+                "fmla    %[r6].4s, v14.4s, %[x0a].s[2]\n"
+                "fmla    %[r7].4s, v15.4s, %[x0a].s[2]\n"
+
+                // Unroll 7
+                "fmla    %[r0].4s, v16.4s, %[x0a].s[3]\n"
+                "fmla    %[r1].4s, v17.4s, %[x0a].s[3]\n"
+                "fmla    %[r2].4s, v18.4s, %[x0a].s[3]\n"
+                "fmla    %[r3].4s, v19.4s, %[x0a].s[3]\n"
+                "fmla    %[r4].4s, v20.4s, %[x0a].s[3]\n"
+                "fmla    %[r5].4s, v21.4s, %[x0a].s[3]\n"
+                "fmla    %[r6].4s, v22.4s, %[x0a].s[3]\n"
+                "fmla    %[r7].4s, v23.4s, %[x0a].s[3]\n"
+                :
+                [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+                [x0] "+w"(x0), [x0a] "+w"(x0a), [k] "+r"(k),
+                [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+                [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+                :
+                : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+                "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory");
+        }
+
+        // Deal with ragged M
+        if(M % 8)
+        {
+            // NOTE: this 'l' shadows the outer element count; here it is the
+            // leftover-row trip count minus one (the final row is handled by
+            // the detached tail after label 2).
+            int l = (M % 8) - 1;
+
+            __asm __volatile(
+                "ldr    q2, [%[a_ptr], #0]\n"
+                "ldr    q3, [%[a_ptr], #16]\n"
+                "ldr    q4, [%[a_ptr], #32]\n"
+                "ldr    q5, [%[a_ptr], #48]\n"
+                "ldr    q6, [%[a_ptr], #64]\n"
+                "ldr    q7, [%[a_ptr], #80]\n"
+                "ldr    q8, [%[a_ptr], #96]\n"
+                "ldr    q9, [%[a_ptr], #112]\n"
+                "ldr    %s[x0], [%[x_ptr]]\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "add    %[x_ptr], %[x_ptr], #4\n"
+
+                "cbz    %w[l], 2f\n"
+
+                "1:\n"
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr    q2, [%[a_ptr], #0]\n"
+                "subs    %w[l], %w[l], #1\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr    q3, [%[a_ptr], #16]\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr    q4, [%[a_ptr], #32]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr    q5, [%[a_ptr], #48]\n"
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr    q6, [%[a_ptr], #64]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr    q7, [%[a_ptr], #80]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr    q8, [%[a_ptr], #96]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr    q9, [%[a_ptr], #112]\n"
+                "ldr    %s[x0], [%[x_ptr]]\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "add    %[x_ptr], %[x_ptr], #4\n"
+                "bne    1b\n"
+
+                "2:\n"
+
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                :
+                [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+                [x0] "+w"(x0), [l] "+r"(l),
+                [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+                [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+                :
+                : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory");
+        }
+
+        // Store the 32 accumulated results back to Y, mirroring the load
+        // logic above for partial final blocks (l < 32).
+        if(l == 32)
+        {
+            // Fast path
+            vst1q_f32(y_ptr, r0);
+            vst1q_f32(y_ptr + 4, r1);
+            vst1q_f32(y_ptr + 8, r2);
+            vst1q_f32(y_ptr + 12, r3);
+            vst1q_f32(y_ptr + 16, r4);
+            vst1q_f32(y_ptr + 20, r5);
+            vst1q_f32(y_ptr + 24, r6);
+            vst1q_f32(y_ptr + 28, r7);
+        }
+        else
+        {
+            int vecs    = l / 4;
+            int oddbits = l % 4;
+
+            if(oddbits)
+            {
+                // As above - slowest path deals with vectors plus odd bits
+                float32x4_t oddvec;
+
+                do
+                {
+                    if(vecs == 0)
+                    {
+                        oddvec = r0;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr, r0);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r1;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 4, r1);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r2;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 8, r2);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r3;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 12, r3);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r4;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 16, r4);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r5;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 20, r5);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r6;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 24, r6);
+                    oddvec = r7;
+                }
+                while(0);
+
+                float *oddbase = y_ptr + l - oddbits;
+
+                switch(oddbits)
+                {
+                    case 3:
+                        vst1q_lane_f32(oddbase + 2, oddvec, 2);
+                    // fall through
+                    case 2:
+                        vst1q_lane_f32(oddbase + 1, oddvec, 1);
+                    // fall through
+                    case 1:
+                        vst1q_lane_f32(oddbase, oddvec, 0);
+                        break;
+
+                    default:
+                        // oddbits must be 1, 2 or 3.
+                        UNREACHABLE("Impossible case in switch.");
+                }
+            }
+            else
+            {
+                // As above - medium path deals with vectors only
+                do
+                {
+                    if(vecs == 0)
+                    {
+                        UNREACHABLE("vecs and oddbits can't both be 0");
+                    }
+
+                    vst1q_f32(y_ptr, r0);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 4, r1);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 8, r2);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 12, r3);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 16, r4);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 20, r5);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 24, r6);
+                }
+                while(0);
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/graph/CL/CLMap.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
similarity index 61%
copy from src/graph/CL/CLMap.cpp
copy to src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 5289ea9..5b9bd72 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,35 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLMap.h"
+#pragma once
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#ifdef __aarch64__
 
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_gemm
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
+// Actual kernel implementations
+void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
 
-void CLMap::run()
+// Transposed SGEMV strategy class.
+class sgemv_trans
 {
-    _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
-}
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width = 96;
+    static const int k_unroll  = 1;
+
+    kern_type kernel = a64_sgemv_trans;
+
+    sgemv_trans(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
new file mode 100644
index 0000000..8fa403b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
@@ -0,0 +1,914 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - a
+// reasonable limit is half the NEON regfile = 64 total accumulators.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop.  In this kernel we
+// process all of M at the same time.
+
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
+
+namespace arm_gemm
+{
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float beta, int lda, int M, int N)
+{
+    const float *a_ptr_base = Astart;
+    float       *y_ptr      = Ystart;
+
+    register const float32x4_t vb asm("v1") = vdupq_n_f32(beta);
+
+    int firstpfd = FIRST_PFD;
+    if(firstpfd > M)
+    {
+        firstpfd = (M - 1);
+    }
+
+    int pfd = PFD;
+    if(pfd > M)
+    {
+        pfd = (M - 1);
+    }
+
+    ptrdiff_t jump = lda * sizeof(int);
+
+    for(; N >= 96; N -= 96)
+    {
+        int k = M - 1;
+
+        const float *a_ptr       = a_ptr_base;
+        const float *x_ptr       = Xstart;
+        const float *pf_ptr      = a_ptr;
+        const float *firstpf_ptr = a_ptr;
+        const float *pf_limit    = a_ptr + (M * lda);
+
+        for(int i = 0; i < firstpfd; i++)
+        {
+            prefetch_1x(firstpf_ptr);
+            firstpf_ptr += lda;
+        }
+
+        for(int i = 0; i < pfd; i++)
+        {
+            prefetch_5x(pf_ptr + 16);
+            pf_ptr += lda;
+        }
+
+        a_ptr_base += 96;
+
+        __asm __volatile(
+            "movi    v8.4s,#0x0\n"
+            "ldr    w0, [%[x_ptr]]\n"
+            "movi    v9.4s,#0x0\n"
+            "ldr    q2,  [%[a_ptr], #0]\n"
+            "movi    v10.4s,#0x0\n"
+            "ldr    q3,  [%[a_ptr], #0x10]\n"
+            "movi    v11.4s,#0x0\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "movi    v12.4s,#0x0\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n"
+            "movi    v13.4s,#0x0\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "movi    v14.4s,#0x0\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "movi    v15.4s,#0x0\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+            "movi    v16.4s, #0x0\n"
+            "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #64]")
+            "movi    v18.4s, #0x0\n"
+            "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #128]")
+            "movi    v20.4s, #0x0\n"
+            "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #192]")
+            "movi    v22.4s, #0x0\n"
+            "movi    v23.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #256]")
+            "movi    v24.4s, #0x0\n"
+            "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #320]")
+            "movi    v26.4s, #0x0\n"
+            "movi    v27.4s, #0x0\n"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "movi    v28.4s, #0x0\n"
+            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "movi    v29.4s, #0x0\n"
+            "movi    v30.4s, #0x0\n"
+            "movi    v31.4s, #0x0\n"
+
+            // Skip everything if there are no iterations of the main loop to do.
+            "cbz    %w[k], 10f\n"
+
+            // Loop with all prefetches.  Exit this loop when firstpf_ptr
+            // hits pf_limit.
+            "1:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #0x4\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "sub    %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x00]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "cmp    %[firstpf_ptr], %[pf_limit]\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "blt    1b\n"
+
+            // Check that there are still "main" prefetches to do.
+            "cmp    %[pf_ptr], %[pf_limit]\n"
+            "bge    9f\n"
+
+            // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
+            "8:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #0x4\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "sub    %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x00]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "cmp    %[pf_ptr], %[pf_limit]\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "blt    8b\n"
+
+            // Check that there is still work to do.
+            "9:\n"
+            "cmp    %w[k], #0\n"
+            "beq    10f\n"
+
+            // Loop without prefetches, exit when k hits 0.
+            "2:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #0x4\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "subs    %w[k], %w[k], #1\n"
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n"
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n"
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n"
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n"
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x00]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n"
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "bne    2b\n"
+
+            "10:\n"
+
+            // Final iteration
+            "dup    v0.4s, w0\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n"
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n"
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n"
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2,  [%[y_ptr]]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3,  [%[y_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4,  [%[y_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5,  [%[y_ptr], #0x30]\n"
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "ldr    q6,  [%[y_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "ldr    q7,  [%[y_ptr], #0x50]\n"
+
+            "fmla    v8.4s, v2.4s, %[vb].4s\n"
+            "ldr    q2, [%[y_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, %[vb].4s\n"
+            "ldr    q3, [%[y_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, %[vb].4s\n"
+            "ldr    q4, [%[y_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, %[vb].4s\n"
+            "ldr    q5, [%[y_ptr], #0x90]\n"
+            "fmla    v12.4s, v6.4s, %[vb].4s\n"
+            "ldr    q6, [%[y_ptr], #0xa0]\n"
+            "str    q8, [%[y_ptr], #0x00]\n"
+            "fmla    v13.4s, v7.4s, %[vb].4s\n"
+            "ldr    q7, [%[y_ptr], #0xb0]\n"
+            "str    q9, [%[y_ptr], #0x10]\n"
+            "fmla    v14.4s, v2.4s, %[vb].4s\n"
+            "ldr    q2, [%[y_ptr], #0xc0]\n"
+            "str    q10, [%[y_ptr], #0x20]\n"
+            "fmla    v15.4s, v3.4s, %[vb].4s\n"
+            "ldr    q3, [%[y_ptr], #0xd0]\n"
+            "str    q11, [%[y_ptr], #0x30]\n"
+            "fmla    v16.4s, v4.4s, %[vb].4s\n"
+            "ldr    q4, [%[y_ptr], #0xe0]\n"
+            "str    q12, [%[y_ptr], #0x40]\n"
+            "fmla    v17.4s, v5.4s, %[vb].4s\n"
+            "ldr    q5, [%[y_ptr], #0xf0]\n"
+            "str    q13, [%[y_ptr], #0x50]\n"
+            "fmla    v18.4s, v6.4s, %[vb].4s\n"
+            "ldr    q6, [%[y_ptr], #0x100]\n"
+            "str    q14, [%[y_ptr], #0x60]\n"
+            "fmla    v19.4s, v7.4s, %[vb].4s\n"
+            "ldr    q7, [%[y_ptr], #0x110]\n"
+            "str    q15, [%[y_ptr], #0x70]\n"
+            "fmla    v20.4s, v2.4s, %[vb].4s\n"
+            "ldr    q2, [%[y_ptr], #0x120]\n"
+            "str    q16, [%[y_ptr], #0x80]\n"
+            "fmla    v21.4s, v3.4s, %[vb].4s\n"
+            "ldr    q3, [%[y_ptr], #0x130]\n"
+            "str    q17, [%[y_ptr], #0x90]\n"
+            "fmla    v22.4s, v4.4s, %[vb].4s\n"
+            "ldr    q4, [%[y_ptr], #0x140]\n"
+            "str    q18, [%[y_ptr], #0xa0]\n"
+            "fmla    v23.4s, v5.4s, %[vb].4s\n"
+            "ldr    q5, [%[y_ptr], #0x150]\n"
+            "str    q19, [%[y_ptr], #0xb0]\n"
+            "fmla    v24.4s, v6.4s, %[vb].4s\n"
+            "ldr    q6, [%[y_ptr], #0x160]\n"
+            "str    q20, [%[y_ptr], #0xc0]\n"
+            "fmla    v25.4s, v7.4s, %[vb].4s\n"
+            "ldr    q7, [%[y_ptr], #0x170]\n"
+            "str    q21, [%[y_ptr], #0xd0]\n"
+            "fmla    v26.4s, v2.4s, %[vb].4s\n"
+            "str    q22, [%[y_ptr], #0xe0]\n"
+            "fmla    v27.4s, v3.4s, %[vb].4s\n"
+            "str    q23, [%[y_ptr], #0xf0]\n"
+            "fmla    v28.4s, v4.4s, %[vb].4s\n"
+            "str    q24, [%[y_ptr], #0x100]\n"
+            "fmla    v29.4s, v5.4s, %[vb].4s\n"
+            "str    q25, [%[y_ptr], #0x110]\n"
+            "fmla    v30.4s, v6.4s, %[vb].4s\n"
+            "str    q26, [%[y_ptr], #0x120]\n"
+            "fmla    v31.4s, v7.4s, %[vb].4s\n"
+            "str    q27, [%[y_ptr], #0x130]\n"
+
+            "stp    q28, q29, [%[y_ptr], #0x140]\n"
+            "stp    q30, q31, [%[y_ptr], #0x160]\n"
+            "add    %[y_ptr], %[y_ptr], #0x180\n"
+
+            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr)
+            : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit)
+            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+            "v27", "v28", "v29", "v30", "v31", "cc");
+    }
+
+    if(N > 0)
+    {
+        // Handle N tail - up to 95 stragglers.
+        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
+        // single value for the remainder.
+
+        // Independent pointers into the matrix for the odd 2 and odd 1.
+        // Double up as flag to indicate whether they are needed.
+        const float *odd2_aptr = NULL;
+        const float *odd1_aptr = NULL;
+
+        // Figure out how much work we need to do.
+        int numvecs = N / 4;
+        int rem     = N % 4;
+        int k       = M;
+
+        // Set up pointers for the odd 2/1 if needed.
+        if(rem >= 2)
+        {
+            odd2_aptr = a_ptr_base + (numvecs * 4);
+        }
+
+        if(rem & 1)
+        {
+            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr == NULL ? 0 : 2);
+        }
+
+        const float *a_ptr       = a_ptr_base;
+        const float *firstpf_ptr = a_ptr_base;
+        const float *pf_ptr      = a_ptr_base;
+        const float *pf_limit    = a_ptr + (M * lda);
+
+        const float *x_ptr = Xstart;
+        int          vecs  = 0; // Working variable to count how many vectors to work on.
+        int          dopf  = 1; // Track whether we are doing prefetches.
+
+        // Figure out how many cache lines we need to prefetch each time.
+        int numpfs = (N + 15) / 16;
+
+        // Do initial prefetches
+        for(int i = 0; i < firstpfd + 1; i++)
+        {
+            prefetch_1x(firstpf_ptr);
+            firstpf_ptr += lda;
+        }
+
+        // Do "main" prefetches - adapt number to the number we actually need.
+        if(numpfs > 1)
+        {
+            for(int i = 0; i < pfd + 1; i++)
+            {
+                switch(numpfs)
+                {
+                    case 2:
+                        prefetch_1x(pf_ptr + 16);
+                        break;
+
+                    case 3:
+                        prefetch_2x(pf_ptr + 16);
+                        break;
+
+                    case 4:
+                        prefetch_3x(pf_ptr + 16);
+                        break;
+
+                    case 5:
+                        prefetch_4x(pf_ptr + 16);
+                        break;
+
+                    case 6:
+                        prefetch_5x(pf_ptr + 16);
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+                pf_ptr += lda;
+            }
+        }
+        else
+        {
+            // Just disable additional prefetches
+            dopf = 0;
+        }
+
+        // Do the real work
+        __asm __volatile(
+            // Initialize all the vectors - not worth skipping this if only
+            // some are needed.
+            "movi    v8.4s,#0x0\n"
+            "ldr    w0, [%[x_ptr]]\n"
+            "movi    v9.4s,#0x0\n"
+            "movi    v10.4s,#0x0\n"
+            "movi    v11.4s,#0x0\n"
+            "movi    v12.4s,#0x0\n"
+            "movi    v13.4s,#0x0\n"
+            "movi    v14.4s,#0x0\n"
+            "movi    v15.4s,#0x0\n"
+            "movi    v16.4s, #0x0\n"
+            "movi    v17.4s, #0x0\n"
+            "movi    v18.4s, #0x0\n"
+            "movi    v19.4s, #0x0\n"
+            "movi    v20.4s, #0x0\n"
+            "movi    v21.4s, #0x0\n"
+            "movi    v22.4s, #0x0\n"
+            "movi    v23.4s, #0x0\n"
+            "movi    v24.4s, #0x0\n"
+            "movi    v25.4s, #0x0\n"
+            "movi    v26.4s, #0x0\n"
+            "movi    v27.4s, #0x0\n"
+            "movi    v28.4s, #0x0\n"
+            "movi    v29.4s, #0x0\n"
+            "movi    v30.4s, #0x0\n"
+            "movi    v6.2s, #0x0\n"
+            "movi    v5.2s, #0x0\n"
+
+            "1:\n" ASM_PREFETCH("[%[firstpf_ptr]]\n")
+            "11:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #4\n"
+
+            "cbz    %w[numvecs], 2f\n"
+            "mov    %w[vecs], %w[numvecs]\n"
+
+            // Vector 0
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x00]\n"
+            "fmla    v8.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 1
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x10]\n"
+            "fmla    v9.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 2
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x20]\n"
+            "fmla    v10.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 3
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x30]\n"
+            "fmla    v11.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 3f\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "3:\n"
+            "beq    2f\n"
+
+            // Vector 4
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x40]\n"
+            "fmla    v12.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 5
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x50]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 6
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x60]\n"
+            "fmla    v14.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 7
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x70]\n"
+            "fmla    v15.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 4f\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "4:\n"
+            "beq    2f\n"
+
+            // Vector 8
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x80]\n"
+            "fmla    v16.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 9
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x90]\n"
+            "fmla    v17.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 10
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xa0]\n"
+            "fmla    v18.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 11
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xb0]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 5f\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "5:\n"
+            "beq    2f\n"
+
+            // Vector 12
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xc0]\n"
+            "fmla    v20.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 13
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xd0]\n"
+            "fmla    v21.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 14
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xe0]\n"
+            "fmla    v22.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 15
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xf0]\n"
+            "fmla    v23.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 6f\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "6:\n"
+            "beq    2f\n"
+
+            // Vector 16
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x100]\n"
+            "fmla    v24.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 17
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x110]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 18
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x120]\n"
+            "fmla    v26.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 19
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x130]\n"
+            "fmla    v27.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 7f\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "7:\n"
+            "beq    2f\n"
+
+            // Vector 20
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x140]\n"
+            "fmla    v28.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 21
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x150]\n"
+            "fmla    v29.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 22
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x160]\n"
+            "fmla    v30.4s, v7.4s, v0.4s\n"
+
+            "2:\n"
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+
+            // Do the odd 2-vector, if needed
+            "cbz    %[odd2_aptr], 8f\n"
+            "ldr    d7, [%[odd2_aptr]]\n"
+            "fmla    v6.2s, v7.2s, v0.2s\n"
+            "add    %[odd2_aptr], %[odd2_aptr], %[jump]\n"
+
+            "8:\n"
+            // Do the odd 1-vector, if needed
+            "cbz    %[odd1_aptr], 9f\n"
+            "ldr    s7, [%[odd1_aptr]]\n"
+            "fmla    v5.2s, v7.2s, v0.2s\n"
+            "add    %[odd1_aptr], %[odd1_aptr], %[jump]\n"
+
+            // Get out if needed.
+            "9:\n"
+            "subs    %w[k], %w[k], #1\n"
+            "beq    10f\n"
+
+            // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "cmp    %[pf_ptr], %[pf_limit]\n"
+            "csel    %w[dopf], %w[dopf], WZR, LT\n"
+
+            // Update the "leading" prefetch pointer, don't do the first
+            // instruction of the loop if it's over the limit.
+            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "cmp    %[firstpf_ptr], %[pf_limit]\n"
+            "blt    1b\n"
+            "b        11b\n"
+
+            // Now write out the outputs
+            "10:\n"
+            "cbz    %w[numvecs], 12f\n"
+            "mov    %w[vecs], %w[numvecs]\n"
+
+            // Vector 0
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v8.4s, v7.4s, %[vb].4s\n"
+            "str    q8, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 1
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v9.4s, v7.4s, %[vb].4s\n"
+            "str    q9, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 2
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v10.4s, v7.4s, %[vb].4s\n"
+            "str    q10, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 3
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v11.4s, v7.4s, %[vb].4s\n"
+            "str    q11, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 4
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v12.4s, v7.4s, %[vb].4s\n"
+            "str    q12, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 5
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v13.4s, v7.4s, %[vb].4s\n"
+            "str    q13, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 6
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v14.4s, v7.4s, %[vb].4s\n"
+            "str    q14, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 7
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v15.4s, v7.4s, %[vb].4s\n"
+            "str    q15, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 8
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v16.4s, v7.4s, %[vb].4s\n"
+            "str    q16, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 9
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v17.4s, v7.4s, %[vb].4s\n"
+            "str    q17, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 10
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v18.4s, v7.4s, %[vb].4s\n"
+            "str    q18, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 11
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v19.4s, v7.4s, %[vb].4s\n"
+            "str    q19, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 12
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v20.4s, v7.4s, %[vb].4s\n"
+            "str    q20, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 13
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v21.4s, v7.4s, %[vb].4s\n"
+            "str    q21, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 14
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v22.4s, v7.4s, %[vb].4s\n"
+            "str    q22, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 15
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v23.4s, v7.4s, %[vb].4s\n"
+            "str    q23, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 16
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v24.4s, v7.4s, %[vb].4s\n"
+            "str    q24, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 17
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v25.4s, v7.4s, %[vb].4s\n"
+            "str    q25, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 18
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v26.4s, v7.4s, %[vb].4s\n"
+            "str    q26, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 19
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v27.4s, v7.4s, %[vb].4s\n"
+            "str    q27, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 20
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v28.4s, v7.4s, %[vb].4s\n"
+            "str    q28, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 21
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v29.4s, v7.4s, %[vb].4s\n"
+            "str    q29, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 22
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v30.4s, v7.4s, %[vb].4s\n"
+            "str    q30, [%[y_ptr]], #0x10\n"
+
+            // Odd 2
+            "12:\n"
+            "cbz    %[odd2_aptr], 13f\n"
+            "ldr    d7, [%[y_ptr]]\n"
+            "fmla    v6.2s, v7.2s, %[vb].2s\n"
+            "str    d6, [%[y_ptr]], #0x8\n"
+
+            // Odd 1
+            "13:\n"
+            "cbz    %[odd1_aptr], 14f\n"
+            "ldr    s7, [%[y_ptr]]\n"
+            "fmla    v5.2s, v7.2s, %[vb].2s\n"
+            "str    s5, [%[y_ptr]]\n"
+
+            "14:\n"
+            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
+            [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
+            [odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
+            [dopf] "+r"(dopf), [vecs] "+r"(vecs)
+            : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
+            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+            "v27", "v28", "v29", "v30", "v31", "cc");
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
new file mode 100644
index 0000000..4a6da3d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* Some of the merge routines need these headers, but since the merges are
+ * included inside the arm_gemm namespace, pull the headers in here first.  */
+#include <arm_neon.h>
+
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm
+{
+/* Generic (non-vectorized) merge routine.
+ *
+ * Writes a GEMM result back into the destination matrix, computing
+ *     out = (out * beta) + (alpha * in)
+ * element-wise.  The input 'in' is laid out as consecutive width x height
+ * blocks (row-major within each block), as produced by the GEMM kernels;
+ * 'out' is a conventional row-major matrix with leading dimension 'ldc'.
+ * Only rows [y0, ymax) and columns [x0, xmax) are written, so ragged
+ * edges at the right/bottom of the matrix are handled by clamping the
+ * per-block fill counts below.
+ */
+template <unsigned int width, unsigned int height, typename Tin, typename Tout>
+inline void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta)
+{
+    // Count whole blocks in each dimension, plus one partial block if the
+    // range is not an exact multiple of the block size.
+    int full_y_blocks = (ymax - y0) / height;
+    int y_remainder   = (ymax - y0) % height;
+    int y_blocks      = full_y_blocks + (y_remainder ? 1 : 0);
+
+    int full_x_blocks = (xmax - x0) / width;
+    int x_remainder   = (xmax - x0) % width;
+    int x_blocks      = full_x_blocks + (x_remainder ? 1 : 0);
+
+    for(int y_block = 0; y_block < y_blocks; y_block++)
+    {
+        int ybase = y0 + (y_block * height);
+
+        // Full blocks write all 'height' rows; the final partial block
+        // writes only the remainder.
+        int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+        for(int x_block = 0; x_block < x_blocks; x_block++)
+        {
+            int xbase = x0 + (x_block * width);
+
+            int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
+
+            for(int row = 0; row < fill_rows; row++)
+            {
+                for(int col = 0; col < fill_cols; col++)
+                {
+                    Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+                    p = (p * beta) + (alpha * in[row * width + col]);
+                }
+            }
+
+            // The input always advances by a whole block, even when only
+            // part of it was valid (the kernel zero-pads ragged blocks).
+            in += (width * height);
+        }
+    }
+}
+
+#include "merges/list.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
new file mode 100644
index 0000000..b44e564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+/* AArch32 NEON-optimized merge for the 8x6 (width x height) block shape.
+ *
+ * Computes out = (out * beta) + (alpha * in) for rows [y0, ymax) and
+ * columns [x0, xmax), where 'in' holds consecutive 8x6 blocks and 'out'
+ * is row-major with leading dimension ldout.  Full interior blocks go
+ * through the inline-assembly fast path; ragged edges fall back to
+ * scalar code.
+ */
+template <>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
+    // Warm the cache ahead of the first block of input.
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    // Broadcast the scalars once for the vector path.
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    // NOTE(review): the y loop steps by 8 although the block height is 6 —
+    // presumably matched to the kernel's row grouping; confirm against the
+    // corresponding a32 GEMM kernel.
+    for(int y = y0; y < ymax; y += 8)
+    {
+        // One output-row pointer per row of the 6-row block.
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+
+        for(int i = x0; i < xmax; i += 8)
+        {
+            float dummyres[8];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 5) >= ymax)
+            {
+                // Deliberate fallthrough: each case redirects its row and
+                // every row below it to the scratch buffer.
+                switch((y + 5) - ymax)
+                {
+                    case 4:
+                        outptr1 = dummyres;
+                        // fall through
+                    case 3:
+                        outptr2 = dummyres;
+                        // fall through
+                    case 2:
+                        outptr3 = dummyres;
+                        // fall through
+                    case 1:
+                        outptr4 = dummyres;
+                        // fall through
+                    case 0:
+                        outptr5 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 7) >= xmax)
+            {
+                for(int xi = 0; xi < 8; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        // Scalar merge: one element per row, rows are 8
+                        // floats apart in the packed input block.
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
+                        outptr5++;
+                    }
+                }
+                // Skip the whole 8x6 input block regardless of how much
+                // of it was valid.
+                inptr += 48;
+            }
+            else
+            {
+                /* Optimized routine to copy an entire block */
+                // Pattern per row pair: load out, scale by beta (VMUL),
+                // load in (post-incrementing inptr), accumulate alpha*in
+                // (VMLA), store back (post-incrementing outptrN).
+                __asm __volatile(
+                    // Rows 0-1
+                    "VLD1.32    {d8-d11},  [%[outptr0]]\n"
+                    "VMUL.f32    q4, q4, %q[bv]\n"
+                    "VLD1.32    {d12-d15}, [%[outptr1]]\n"
+                    "VMUL.f32    q5, q5, %q[bv]\n"
+                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
+                    "VMUL.f32    q6, q6, %q[bv]\n"
+                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
+                    "VMUL.f32    q7, q7, %q[bv]\n"
+
+                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[inptr], #352]")
+                    "VMLA.f32    q5, q1, %q[av]\n"
+                    "VST1.32    {d8-d11}, [%[outptr0]]!\n" ASM_PREFETCH("[%[inptr], #416]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[inptr], #480]")
+                    "VMLA.f32    q7, q3, %q[av]\n"
+                    "VST1.32    {d12-d15}, [%[outptr1]]!\n"
+
+                    // Rows 2-3
+                    "VLD1.32    {d8-d11},  [%[outptr2]]\n"
+                    "VMUL.f32    q4, q4, %q[bv]\n"
+                    "VLD1.32    {d12-d15}, [%[outptr3]]\n"
+                    "VMUL.f32    q5, q5, %q[bv]\n"
+                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
+                    "VMUL.f32    q6, q6, %q[bv]\n"
+                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
+                    "VMUL.f32    q7, q7, %q[bv]\n"
+
+                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr0], #96]")
+                    "VMLA.f32    q5, q1, %q[av]\n"
+                    "VST1.32    {d8-d11}, [%[outptr2]]!\n" ASM_PREFETCH("[%[outptr1], #96]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr2], #96]")
+                    "VMLA.f32    q7, q3, %q[av]\n"
+                    "VST1.32    {d12-d15}, [%[outptr3]]!\n"
+
+                    // Rows 4-5
+                    "VLD1.32    {d8-d11},  [%[outptr4]]\n"
+                    "VMUL.f32    q4, q4, %q[bv]\n"
+                    "VLD1.32    {d12-d15}, [%[outptr5]]\n"
+                    "VMUL.f32    q5, q5, %q[bv]\n"
+                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
+                    "VMUL.f32    q6, q6, %q[bv]\n"
+                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
+                    "VMUL.f32    q7, q7, %q[bv]\n"
+
+                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr3], #96]")
+                    "VMLA.f32    q5, q1, %q[av]\n"
+                    "VST1.32    {d8-d11}, [%[outptr4]]!\n" ASM_PREFETCH("[%[outptr4], #96]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr5], #128]")
+                    "VMLA.f32    q7, q3, %q[av]\n"
+                    "VST1.32    {d12-d15}, [%[outptr5]]!\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [inptr] "+r"(inptr)
+                    : [av] "w"(av), [bv] "w"(bv)
+                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+            }
+        }
+    }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000..3b59a43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+/* AArch64 NEON-optimized merge for the 12x8 (width x height) block shape.
+ *
+ * Computes out = (out * beta) + (alpha * in) for rows [y0, ymax) and
+ * columns [x0, xmax), where 'in' holds consecutive 12x8 blocks and 'out'
+ * is row-major with leading dimension ldout.  Full interior blocks go
+ * through the inline-assembly fast path; ragged edges fall back to
+ * scalar code.
+ */
+template <>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
+    // Warm the cache ahead of the first block of input.
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    // Broadcast the scalars once for the vector path.
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        // One output-row pointer per row of the 8-row block.
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+        float *outptr6 = outptr5 + ldout;
+        float *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 12)
+        {
+            float dummyres[12];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                // Deliberate fallthrough: each case redirects its row and
+                // every row below it to the scratch buffer.
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                        // fall through
+                    case 5:
+                        outptr2 = dummyres;
+                        // fall through
+                    case 4:
+                        outptr3 = dummyres;
+                        // fall through
+                    case 3:
+                        outptr4 = dummyres;
+                        // fall through
+                    case 2:
+                        outptr5 = dummyres;
+                        // fall through
+                    case 1:
+                        outptr6 = dummyres;
+                        // fall through
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 11) >= xmax)
+            {
+                for(int xi = 0; xi < 12; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        // Scalar merge: one element per row, rows are 12
+                        // floats apart in the packed input block.
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                // Skip the whole 12x8 input block regardless of how much
+                // of it was valid.
+                inptr += 96;
+            }
+            else
+            {
+                /* Optimized routine to copy an entire block */
+                // Pattern per row pair: load 12 floats of out (LDP+LDR),
+                // scale by beta (FMUL), load 24 floats of in (LDP x3),
+                // accumulate alpha*in (FMLA), store back (STP+STR with
+                // post-increment of the row pointer).  inptr is advanced
+                // once, at the end, by 96 floats (384 bytes).
+                // NOTE(review): v6 appears in the clobber list below but is
+                // never written by this asm block — harmless, possibly stale.
+                __asm __volatile(
+                    // Rows 0-1
+                    "LDP    q16, q17, [%[outptr0]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr0], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr1]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr1], #32]\n" ASM_PREFETCH("[%[inptr], #768]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr]]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #32]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #64]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #832]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr0]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr0]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr1]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr1]], #16\n"
+
+                    // Rows 2-3
+                    "LDP    q16, q17, [%[outptr2]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr2], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr3]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr3], #32]\n" ASM_PREFETCH("[%[inptr], #960]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #96]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #128]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #160]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1024]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr2]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr2]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1088]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr3]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr3]], #16\n"
+
+                    // Rows 4-5
+                    ASM_PREFETCH("[%[outptr0], #80]")
+                    "LDP    q16, q17, [%[outptr4]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr4], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr5]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr5], #32]\n" ASM_PREFETCH("[%[outptr1], #80]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #192]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #224]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #256]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr2], #80]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr4]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr4]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr3], #80]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr5]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr5]], #16\n"
+
+                    // Rows 6-7
+                    ASM_PREFETCH("[%[outptr4], #80]")
+                    "LDP    q16, q17, [%[outptr6]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr6], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr7]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr7], #32]\n" ASM_PREFETCH("[%[outptr5], #80]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #288]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #320]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #352]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr6], #128]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr6]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr6]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr7], #128]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr7]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr7]], #16\n"
+                    "ADD    %[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [av] "w"(av), [bv] "w"(bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+            }
+        }
+    }
+}
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
new file mode 100644
index 0000000..9708fe1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+// This should be possible on any AArch64 target, but some old compilers don't support __fp16 arguments.
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<12, 8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta)
+{
+    const float *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 24);
+
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        __fp16 *outptr0 = out + (y * ldout) + x0;
+        __fp16 *outptr1 = outptr0 + ldout;
+        __fp16 *outptr2 = outptr1 + ldout;
+        __fp16 *outptr3 = outptr2 + ldout;
+        __fp16 *outptr4 = outptr3 + ldout;
+        __fp16 *outptr5 = outptr4 + ldout;
+        __fp16 *outptr6 = outptr5 + ldout;
+        __fp16 *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 12)
+        {
+            __fp16 dummyres[12];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 11) >= xmax)
+            {
+                for(int xi = 0; xi < 12; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 96;
+            }
+            else
+            {
+                /* Optimized routine to copy an entire block */
+                __asm __volatile(
+                    // Rows 0-1
+                    "LDR    q16, [%[outptr0]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr0], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr1]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr1], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr]]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #32]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #64]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #768]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #832]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #960]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr0]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr0]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr1]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr1]], #8\n"
+
+                    // Rows 2-3
+                    "LDR    q16, [%[outptr2]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr2], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr3]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr3], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #96]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #128]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #160]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #1024]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #1088]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr0], #64]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr1], #64]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr2]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr2]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr3]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr3]], #8\n"
+
+                    // Rows 4-5
+                    "LDR    q16, [%[outptr4]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr4], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr5]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr5], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #192]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #224]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #256]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr2], #64]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr3], #64]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr4], #88]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr4]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr4]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr5]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr5]], #8\n"
+
+                    // Rows 6-7
+                    "LDR    q16, [%[outptr6]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr6], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr7]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr7], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #288]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #320]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #352]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr5], #64]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr6], #88]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr7], #88]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr6]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr6]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr7]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr7]], #8\n"
+                    "ADD    %[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [av] "w"(av), [bv] "w"(bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "memory");
+            }
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
new file mode 100644
index 0000000..08cfc00
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+template <>
+inline void MergeResults<24, 8>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax,
+                                const int x0, const int xmax, const __fp16 alpha, const __fp16 beta)
+{
+    const __fp16 *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 48);
+
+    float16x8_t va = vdupq_n_f16(alpha);
+    float16x8_t vb = vdupq_n_f16(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        __fp16 *outptr0 = out + (y * ldout) + x0;
+        __fp16 *outptr1 = outptr0 + ldout;
+        __fp16 *outptr2 = outptr1 + ldout;
+        __fp16 *outptr3 = outptr2 + ldout;
+        __fp16 *outptr4 = outptr3 + ldout;
+        __fp16 *outptr5 = outptr4 + ldout;
+        __fp16 *outptr6 = outptr5 + ldout;
+        __fp16 *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 24)
+        {
+            __fp16 dummyres[24];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 23) >= xmax)
+            {
+                for(int xi = 0; xi < 24; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 192;
+            }
+            else
+            {
+                /* Optimized routine to copy an entire block */
+                __asm __volatile(
+                    ".arch    armv8.2-a+fp16\n"
+                    // Rows 0-1
+                    "LDP    q16, q17, [%[outptr0]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr0], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr1]]\n"
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #768]")
+                    "LDR    q21, [%[outptr1], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr]]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #32]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #64]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #832]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr0]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr0]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #896]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr1]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr1]], #16\n" ASM_PREFETCH("[%[inptr], #960]")
+
+                    // Rows 2-3
+                    "LDP    q16, q17, [%[outptr2]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr2], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr3]]\n"
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #1024]")
+                    "LDR    q21, [%[outptr3], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr], #96]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #128]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #160]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #1088]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr2]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr2]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr0], #80]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr3]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr3]], #16\n" ASM_PREFETCH("[%[outptr1], #80]")
+
+                    // Rows 4-5
+                    "LDP    q16, q17, [%[outptr4]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr4], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr5]]\n"
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[outptr2], #80]")
+                    "LDR    q21, [%[outptr5], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr], #192]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #224]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #256]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr3], #80]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr4]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr4]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr4], #80]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr5]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr5]], #16\n"
+
+                    // Rows 6-7
+                    "LDP    q16, q17, [%[outptr6]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr6], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr7]]\n" ASM_PREFETCH("[%[outptr5], #80]")
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n"
+                    "LDR    q21, [%[outptr7], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr], #288]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #320]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #352]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr6], #128]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr6]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr6]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr7], #128]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr7]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr7]], #16\n"
+                    "ADD    %[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [va] "w"(va), [vb] "w"(vb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "memory");
+            }
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
new file mode 100644
index 0000000..79dd1f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta)
+{
+    const int32_t *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    int32x4_t alpha_value = vdupq_n_s32(alpha);
+    int32x4_t beta_value  = vdupq_n_s32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        int32_t *outptr0 = out + (y * ldout) + x0;
+        int32_t *outptr1 = outptr0 + ldout;
+        int32_t *outptr2 = outptr1 + ldout;
+        int32_t *outptr3 = outptr2 + ldout;
+        int32_t *outptr4 = outptr3 + ldout;
+        int32_t *outptr5 = outptr4 + ldout;
+        int32_t *outptr6 = outptr5 + ldout;
+        int32_t *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 12)
+        {
+            int32_t dummyres[12];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 11) >= xmax)
+            {
+                for(int xi = 0; xi < 12; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 96;
+            }
+            else
+            {
+                /* Optimized routine to copy an entire block */
+                __asm __volatile(
+                    // Row 0
+                    ASM_PREFETCH("[%x[outptr1], #192]")
+                    "ldr q3, [%x[outptr0]]\n"
+                    "ldr q4, [%x[outptr0], #0x10]\n"
+                    "ldr q5, [%x[outptr0], #0x20]\n"
+                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr]]\n"
+                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x10]\n"
+                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x20]\n"
+                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q0, [%x[outptr1]]\n"
+                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q1, [%x[outptr1], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q2, [%x[outptr1], #0x20]\n"
+
+                    // Row 1
+                    ASM_PREFETCH("[%x[outptr2], #192]")
+                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x30]\n"
+                    "str q3, [%x[outptr0]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x40]\n"
+                    "str q4, [%x[outptr0]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x50]\n"
+                    "str q5, [%x[outptr0]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q3, [%x[outptr2]]\n"
+                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q4, [%x[outptr2], #0x10]\n"
+                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q5, [%x[outptr2], #0x20]\n"
+
+                    // Row 2
+                    ASM_PREFETCH("[%x[outptr3], #192]")
+                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x60]\n"
+                    "str q0, [%x[outptr1]], #0x10\n"
+                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x70]\n"
+                    "str q1, [%x[outptr1]], #0x10\n"
+                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x80]\n"
+                    "str q2, [%x[outptr1]], #0x10\n"
+                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q0, [%x[outptr3]]\n"
+                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q1, [%x[outptr3], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q2, [%x[outptr3], #0x20]\n"
+
+                    // Row 3
+                    ASM_PREFETCH("[%x[outptr4], #192]")
+                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x90]\n"
+                    "str q3, [%x[outptr2]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0xa0]\n"
+                    "str q4, [%x[outptr2]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0xb0]\n"
+                    "str q5, [%x[outptr2]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q3, [%x[outptr4]]\n"
+                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q4, [%x[outptr4], #0x10]\n"
+                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q5, [%x[outptr4], #0x20]\n"
+
+                    // Row 4
+                    ASM_PREFETCH("[%x[outptr5], #192]")
+                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0xc0]\n"
+                    "str q0, [%x[outptr3]], #0x10\n"
+                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0xd0]\n"
+                    "str q1, [%x[outptr3]], #0x10\n"
+                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0xe0]\n"
+                    "str q2, [%x[outptr3]], #0x10\n"
+                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q0, [%x[outptr5]]\n"
+                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q1, [%x[outptr5], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q2, [%x[outptr5], #0x20]\n"
+
+                    // Row 5
+                    ASM_PREFETCH("[%x[outptr6], #192]")
+                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0xf0]\n"
+                    "str q3, [%x[outptr4]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x100]\n"
+                    "str q4, [%x[outptr4]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x110]\n"
+                    "str q5, [%x[outptr4]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q3, [%x[outptr6]]\n"
+                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q4, [%x[outptr6], #0x10]\n"
+                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q5, [%x[outptr6], #0x20]\n"
+
+                    // Row 6
+                    ASM_PREFETCH("[%x[outptr7], #192]")
+                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x120]\n"
+                    "str q0, [%x[outptr5]], #0x10\n"
+                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x130]\n"
+                    "str q1, [%x[outptr5]], #0x10\n"
+                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x140]\n"
+                    "str q2, [%x[outptr5]], #0x10\n"
+                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                    "ldr q0, [%x[outptr7]]\n"
+                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                    "ldr q1, [%x[outptr7], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                    "ldr q2, [%x[outptr7], #0x20]\n"
+
+                    // Row 7
+                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x150]\n"
+                    "str q3, [%x[outptr6]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x160]\n"
+                    "str q4, [%x[outptr6]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x170]\n"
+                    "str q5, [%x[outptr6]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                    "str q0, [%x[outptr7]], #0x10\n"
+                    "str q1, [%x[outptr7]], #0x10\n"
+                    "str q2, [%x[outptr7]], #0x10\n"
+
+                    "add %x[inptr], %x[inptr], #0x180\n"
+                    : [outptr0] "+r"(outptr0),
+                    [outptr1] "+r"(outptr1),
+                    [outptr2] "+r"(outptr2),
+                    [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4),
+                    [outptr5] "+r"(outptr5),
+                    [outptr6] "+r"(outptr6),
+                    [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [alpha_value] "w"(alpha_value),
+                    [beta_value] "w"(beta_value)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+            }
+        }
+    }
+}
+
+template <>
+inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta)
+{
+    // The 12x8 merge kernel uses only MUL and MLA, whose results are
+    // bit-identical for signed and unsigned 32-bit operands (two's-complement
+    // wrap-around), so it is safe to forward to the int32_t specialisation.
+    // Note: top-level const on a cast target type is meaningless for scalar
+    // prvalues and has been dropped from the static_casts.
+    MergeResults<12, 8>(reinterpret_cast<int32_t *>(out), reinterpret_cast<const int32_t *>(in), ldout, y0, ymax, x0, xmax, static_cast<int32_t>(alpha), static_cast<int32_t>(beta));
+}
+
+#endif // __aarch64__
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
similarity index 65%
copy from src/graph/CL/CLUnmap.cpp
copy to src/core/NEON/kernels/arm_gemm/merges/list.hpp
index 31f2f19..d93f1b0 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLUnmap.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute::graph;
-
-CLUnmap::CLUnmap(ITensorObject *tensor)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
-{
-    _tensor->unmap(arm_compute::CLScheduler::get().queue());
-}
+#include "a32_merge_float_8x6.hpp"
+#include "a64_merge_float_12x8.hpp"
+#include "a64_merge_float_to_half_12x8.hpp"
+#include "a64_merge_half_24x8.hpp"
+#include "a64_merge_int32_12x8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/profiler.hpp b/src/core/NEON/kernels/arm_gemm/profiler.hpp
new file mode 100644
index 0000000..ada0c95
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/profiler.hpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef CYCLE_PROFILING
+
+#include "../perf.h"
+
+#ifndef NO_MULTI_THREADING
+#include <mutex>
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+extern std::mutex report_mutex;
+#endif
+
+// Cycle-count profiler: records (event id, work-unit count, cycle count)
+// triples for up to 'maxevents' profiled regions and prints a per-event
+// summary on destruction.  Once the table is full, further regions simply
+// run unprofiled.
+class profiler
+{
+private:
+    static const int maxevents         = 100000;
+    unsigned long    times[maxevents]  = {}; // cycles measured for each recorded event
+    unsigned long    units[maxevents]  = {}; // work units (bytes/MACs) for each event
+    int              events[maxevents] = {}; // event id (1..4, see PROFILE_* macros)
+    int              currentevent      = 0;  // number of events recorded so far
+    int              countfd           = 0;  // perf cycle-counter file descriptor
+
+    // RAII helper: starts the cycle counter on construction and stores the
+    // elapsed cycle count into the parent's event table on destruction.
+    class ScopedProfilerClass
+    {
+    private:
+        profiler &_parent;
+        bool      legal = false; // stays false when the event table is full
+
+    public:
+        ScopedProfilerClass(profiler &prof, int i, unsigned long u)
+            : _parent(prof)
+        {
+            // Table full: record nothing for this region.
+            if(prof.currentevent == maxevents)
+                return;
+
+            prof.events[prof.currentevent] = i;
+            prof.units[prof.currentevent]  = u;
+            legal                          = true;
+            start_counter(prof.countfd);
+        }
+
+        ~ScopedProfilerClass()
+        {
+            if(!legal)
+                return;
+
+            long long cycs                        = stop_counter(_parent.countfd);
+            _parent.times[_parent.currentevent++] = cycs;
+        }
+    };
+
+public:
+    profiler()
+    {
+        countfd = open_cycle_counter();
+    }
+
+    ~profiler()
+    {
+        close(countfd);
+        int           tots[5];
+        unsigned long counts[5];
+        unsigned long tunits[5];
+        const char   *descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+
+        // Aggregate per event id; ids run 1..4, slot 0 is unused.
+        for(int i = 1; i < 5; i++)
+        {
+            tots[i]   = 0;
+            counts[i] = 0;
+            tunits[i] = 0;
+        }
+
+        for(int i = 0; i < currentevent; i++)
+        {
+            tots[events[i]]++;
+            counts[events[i]] += times[i];
+            tunits[events[i]] += units[i];
+        }
+
+#ifdef NO_MULTI_THREADING
+        printf("Profiled events:\n");
+#else
+        // Serialise reports from per-thread profilers destroyed concurrently.
+        std::lock_guard<std::mutex> lock(report_mutex);
+        printf("Profiled events (cpu %d):\n", sched_getcpu());
+#endif
+
+        printf("%20s  %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
+        for(int i = 1; i < 5; i++)
+        {
+            // Guard the divisions: a category with no recorded events would
+            // otherwise cause integer division by zero (undefined behaviour).
+            unsigned long average   = tots[i] ? (counts[i] / tots[i]) : 0;
+            float         per_cycle = counts[i] ? ((float)tunits[i] / counts[i]) : 0.0f;
+            printf("%20s: %9d %9ld %9ld %12lu %9.2f\n", descs[i - 1], tots[i], counts[i], average, tunits[i], per_cycle);
+        }
+    }
+
+    // Profile a single callable: record event id 'i' with unit count 'u',
+    // or just run 'func' unprofiled when the event table is full.
+    template <typename T>
+    void operator()(int i, unsigned long u, T func)
+    {
+        if(currentevent == maxevents)
+        {
+            func();
+        }
+        else
+        {
+            events[currentevent] = i;
+            units[currentevent]  = u;
+            start_counter(countfd);
+            func();
+            long long cycs        = stop_counter(countfd);
+            times[currentevent++] = cycs;
+        }
+    }
+
+    // Scope-based variant: the returned object records the region between
+    // its construction and destruction.
+    ScopedProfilerClass ScopedProfiler(int i, unsigned long u)
+    {
+        return ScopedProfilerClass(*this, i, u);
+    }
+};
+
+#endif // CYCLE_PROFILING
+
+} // namespace arm_gemm
+
+#define PROFILE_PREPA 1
+#define PROFILE_PREPB 2
+#define PROFILE_KERNEL 3
+#define PROFILE_MERGE 4
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
new file mode 100644
index 0000000..c80bb59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row.  This same number of values
+ * are then read from the next <IntBy-1> rows.  Now return to the first
+ * input row and repeat.
+ *
+ * Need to cope with the work requested in either dimension not actually
+ * being a multiple of the block sizes.
+ */
+template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
+struct TransformImpl
+{
+    // Generic (non-specialised) transform: copy the region [y0,ymax) x
+    // [x0,xmax) of 'in' (leading dimension 'stride') to 'out' contiguously,
+    // in IntBy-row by BlockBy-column blocks, zero-padding partial blocks up
+    // to full size in both dimensions.  When Transposed, row and column
+    // indices are swapped on the input side only.
+    template <typename TOut, typename TIn>
+    static void Transform(TOut *out, const TIn *const in, const int stride,
+                          const int y0, const int ymax, const int x0, const int xmax)
+    {
+        // Whole-block counts plus a ragged tail block in each dimension.
+        const int n_whole_y_blocks = (ymax - y0) / IntBy;
+        const int y_remainders     = (ymax - y0) % IntBy;
+        const int n_y_blocks       = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+        const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+        const int x_remainders     = (xmax - x0) % BlockBy;
+        const int n_x_blocks       = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+        // "Y" loop: advance down the rows of the source IntBy rows at a time.
+        // Set up fill_rows to show the number rows to copy from, and blank_rows
+        // for the number of blank rows to add.
+        for(int y_block = 0; y_block < n_y_blocks; y_block++)
+        {
+            int fill_rows  = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+            int blank_rows = IntBy - fill_rows;
+
+            int y_base = y0 + (y_block * IntBy);
+
+            // So now advance along this block of rows, BlockBy columns at a time.
+            for(int x_block = 0; x_block < n_x_blocks; x_block++)
+            {
+                int fill_cols  = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+                int blank_cols = BlockBy - fill_cols;
+
+                int x_base = x0 + (x_block * BlockBy);
+
+                for(int row = 0; row < fill_rows; row++)
+                {
+                    for(int col = 0; col < fill_cols; col++)
+                    {
+                        // In-range copy.  If it's transposed, we reverse the sense of rows and columns here.
+                        if(Transposed)
+                        {
+                            *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+                        }
+                        else
+                        {
+                            *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+                        }
+                    }
+                    // "col" tail - row is in range but column is out of range.
+                    for(int col = 0; col < blank_cols; col++)
+                    {
+                        *out++ = static_cast<TOut>(0);
+                    }
+                }
+                // "row" tail - row is out of range so fill with zeros always.
+                for(int row = 0; row < blank_rows; row++)
+                {
+                    for(int col = 0; col < (fill_cols + blank_cols); col++)
+                    {
+                        *out++ = static_cast<TOut>(0);
+                    }
+                }
+            }
+        }
+    }
+
+    // Convenience overload for identical input and output element types.
+    template <typename T>
+    static inline void Transform(T *out, const T *const in, const int stride,
+                                 const int k0, const int kmax, const int x0, const int xmax)
+    {
+        Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+    }
+};
+
+/*****************************************************************************/
+// Public entry point: dispatch on the element sizes so that any
+// size-keyed asm-optimised TransformImpl specialisation for this geometry
+// is picked up; otherwise the generic implementation above is used.
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
+void Transform(
+    TOut *out, const TIn *const in, const int stride,
+    const int k0, const int kmax, const int x0, const int xmax)
+{
+    // Redirect to a specialised implementation predicated on argument size.
+    TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+        out, in, stride, k0, kmax, x0, xmax);
+}
+/*****************************************************************************/
+
+#include "transforms/list.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
new file mode 100644
index 0000000..501d6bf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint32_t       *outptr = reinterpret_cast<uint32_t *>(out);
+    const uint32_t *inptr  = reinterpret_cast<const uint32_t *>(in);
+
+    // Must be zero-initialised: rows beyond ymax are redirected here and the
+    // buffer contents are copied out as padding.
+    uint32_t zerobuff[8] = { 0 };
+
+    for(int y = y0; y < ymax; y += 6)
+    {
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+
+        //prefetch_2x(inptr0);
+        //prefetch_2x(inptr1);
+        //prefetch_2x(inptr2);
+        //prefetch_2x(inptr3);
+        //prefetch_2x(inptr4);
+        //prefetch_2x(inptr5);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead.
+             * Reapplied every iteration because the asm advances the pointers. */
+            if((y + 5) >= ymax)
+            {
+                switch((y + 5) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 4:
+                        inptr1 = zerobuff;
+                    case 3:
+                        inptr2 = zerobuff;
+                    case 2:
+                        inptr3 = zerobuff;
+                    case 1:
+                        inptr4 = zerobuff;
+                    case 0:
+                        inptr5 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            __asm __volatile(
+                // Load up 8 elements (2 vectors) from each of 6 sources.
+                "VLD1.32    {d0-d3}, [%[inptr0]]!\n"   // q0=A0A1A2A3
+                "VLD1.32    {d4-d7}, [%[inptr1]]!\n"   // q2=B0B1B2B3
+                "VLD1.32    {d8-d11}, [%[inptr2]]!\n"  // q4=C0C1C2C3
+                "VZIP.32    q0, q4\n"                  // q0=A0C0A1C1, q4 = A2C2A3C3
+                "VLD1.32    {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+                "VZIP.32    q2, q6\n"                  // q2=B0D0B1D1, q6 = B2D2B3D3
+                "VLD1.32    {d16-d19}, [%[inptr4]]!\n"
+                "VLD1.32    {d20-d23}, [%[inptr5]]!\n"
+                "VZIP.32    q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "VZIP.32    q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+
+                // Store first elements
+                "VST1.32    {d0-d1}, [%[outptr]]!\n"
+                "VST1.32    {d16}, [%[outptr]]!\n"
+
+                "VZIP.32    q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+
+                // Store second elements
+                "VST1.32    {d4-d5}, [%[outptr]]!\n"
+                "VZIP.32    q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "VST1.32    {d17}, [%[outptr]]!\n"
+                "VZIP.32    q3, q7\n"
+
+                // Store third elements
+                "VZIP.32    q9, q11\n"
+                "VST1.32    {d8-d9}, [%[outptr]]!\n"
+                "VZIP.32    q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]")
+                "VST1.32    {d20}, [%[outptr]]!\n"
+
+                // Store fourth elements
+                "VZIP.32    q5, q7\n"
+                "VST1.32    {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "VST1.32    {d21}, [%[outptr]]!\n"
+
+                // Fifth
+                "VST1.32    {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "VST1.32    {d18}, [%[outptr]]!\n"
+
+                // Sixth
+                "VST1.32    {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "VST1.32    {d19}, [%[outptr]]!\n"
+
+                // Seventh
+                "VST1.32    {d10-d11}, [%[outptr]]!\n"
+                "VST1.32    {d22}, [%[outptr]]!\n"
+
+                // Eighth
+                "VST1.32    {d14-d15}, [%[outptr]]!\n"
+                "VST1.32    {d23}, [%[outptr]]!\n"
+
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
+        }
+
+        /* Scalar tail (<8 elements): re-apply the zero-buffer substitution.
+         * This both rewinds pointers the vector loop advanced past the end of
+         * zerobuff and covers the case where the vector loop never ran, in
+         * which case out-of-range rows would otherwise be read out of bounds. */
+        if((y + 5) >= ymax)
+        {
+            switch((y + 5) - ymax)
+            {
+                /* Everything falls through in here */
+                case 4:
+                    inptr1 = zerobuff;
+                case 3:
+                    inptr2 = zerobuff;
+                case 2:
+                    inptr3 = zerobuff;
+                case 1:
+                    inptr4 = zerobuff;
+                case 0:
+                    inptr5 = zerobuff;
+                    break;
+
+                default:
+                    UNREACHABLE("Impossible.");
+            }
+        }
+
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+        }
+    }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..ea32c96
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 8x32-bit sized specialisation
+// Each 4-byte element is reinterpreted as two consecutive 2-byte elements,
+// so the x extent and stride double while the k extent is unchanged; the
+// copy itself only moves bit patterns, so this is valid for any 4-byte T.
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, true, 4, 4>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a 16x uint16_t specialisation
+    TransformImpl<16, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 16x16-bit sized specialisation: route any 2-byte element type
+// through the uint16_t implementation below (only bit patterns are moved,
+// so the element type is irrelevant).
+template <>
+template <typename T>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 16 x uint16_t version
+// Copy one block of 16 uint16_t values (32 bytes) from in0 to out.
+// in0 is passed by reference and advanced past the copied data; a prefetch
+// is issued 192 bytes ahead of the source.
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+    __asm volatile(
+        "VLD1.32    {d0-d3}, [%[in0]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        : [in0] "+r"(in0),
+        [out] "+r"(out)
+        :
+        : "q0", "q1", "memory");
+}
+
+// Copy one 16-element block from each of two rows: the in0 block is stored
+// at out and the in1 block at out + 16 elements (the trailing SUB restores
+// the out register after the post-incrementing store).  Both source
+// pointers are passed by reference and advanced past the copied data.
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+    __asm volatile(
+        "VLD1.32    {d0-d3}, [%[in0]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+        "VLD1.32    {d0-d3}, [%[in1]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB    %[out], %[out], #32\n"
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [out] "+r"(out)
+        :
+        : "q0", "q1", "memory");
+}
+
+// Copy one 16-element block from each of four rows, stored consecutively at
+// out, out+16, out+32 and out+48 elements (the trailing SUB restores the
+// out register after three post-incrementing stores).  All four source
+// pointers are passed by reference and advanced past the copied data.
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+    __asm __volatile(
+        "VLD1.32    {d0-d3}, [%[in0]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+        "VLD1.32    {d0-d3}, [%[in1]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
+        "VLD1.32    {d0-d3}, [%[in2]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
+        "VLD1.32    {d0-d3}, [%[in3]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB    %[out], %[out], #96\n"
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [in2] "+r"(in2),
+        [in3] "+r"(in3),
+        [out] "+r"(out)
+        :
+        : "q0", "q1", "memory");
+}
+
+// Concrete 16 x uint16_t implementation: delegate to the shared
+// transpose/interleave driver, which uses the specialised moveblock_1x1/2/4
+// helpers above.
+template <>
+template <>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+    uint16_t *out, const uint16_t *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
new file mode 100644
index 0000000..8d61f15
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+#include "../utils.hpp"
+
+// 4-way interleave with 16-byte blocking for 8-bit data: rows are processed in
+// groups of four, and 16 bytes from each row are copied in turn so the output
+// holds r0[0..15] r1[0..15] r2[0..15] r3[0..15], then the next 16 columns.
+// Rows past ymax are substituted with zeros; a ragged tail in the k dimension
+// is zero-padded up to a full 16-byte block.
+template <>
+template <typename T>
+void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint8_t       *outptr = (uint8_t *)out;
+    const uint8_t *inptr  = (const uint8_t *)in;
+
+    /* Out-of-range row pointers are redirected at this buffer so that padding
+     * reads yield zeros.  It must be explicitly zero-initialised: previously
+     * it was left uninitialised, so ragged rows were padded with
+     * indeterminate stack contents instead of zeros. */
+    uint8_t zerobuff[16] = { 0 };
+
+    for(int y = y0; y < ymax; y += 4)
+    {
+        const uint8_t *inptr0 = inptr + y * ldin + k0;
+        const uint8_t *inptr1 = inptr0 + ldin;
+        const uint8_t *inptr2 = inptr1 + ldin;
+        const uint8_t *inptr3 = inptr2 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+
+        int x = (kmax - k0);
+        for(; x > 15; x -= 16)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 3) >= ymax)
+            {
+                switch((y + 3) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 2:
+                        inptr1 = zerobuff;
+                    case 1:
+                        inptr2 = zerobuff;
+                    case 0:
+                        inptr3 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* Copy one 16-byte block from each of the four rows.  The
+             * "memory" clobber is required: the asm stores through outptr,
+             * which the compiler cannot see from the operand list alone. */
+            __asm __volatile(
+                "LDR    q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR    q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
+                "STP    q0, q1, [%[outptr]], #32\n"
+                "LDR    q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR    q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP    q0, q1, [%[outptr]], #32\n"
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [outptr] "+r"(outptr)
+                :
+                : "v0", "v1", "memory");
+        }
+
+        if(x > 0)
+        {
+            /* Need to duplicate this here, in case we didn't run the main loop. */
+            if((y + 3) >= ymax)
+            {
+                switch((y + 3) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 2:
+                        inptr1 = zerobuff;
+                    case 1:
+                        inptr2 = zerobuff;
+                    case 0:
+                        inptr3 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
+            auto f = [&outptr, x](const uint8_t *&p)
+            {
+                for(int i = 0; i < 16; i++)
+                {
+                    if(i < x)
+                    {
+                        *outptr++ = *p++;
+                    }
+                    else
+                    {
+                        *outptr++ = 0;
+                    }
+                }
+            };
+
+            f(inptr0);
+            f(inptr1);
+            f(inptr2);
+            f(inptr3);
+        }
+    }
+}
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
new file mode 100644
index 0000000..3cbc881
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+// 8-way interleave for 16-bit data: read 8 elements from each of eight rows
+// per pass and emit the 8x8 tile transposed (via a ZIP tree), so element k of
+// rows 0..7 is contiguous in the output.  Rows past ymax read zeros; a ragged
+// tail in the k dimension is handled element-by-element.
+template <>
+template <typename T>
+void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint16_t       *outptr = (uint16_t *)out;
+    const uint16_t *inptr  = (const uint16_t *)in;
+
+    /* Out-of-range row pointers are redirected at this buffer so padding
+     * reads yield zeros; it must be explicitly zero-initialised (it was
+     * previously left uninitialised, padding with indeterminate values). */
+    uint16_t zerobuff[24] = { 0 };
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        const uint16_t *inptr0 = inptr + y * ldin + k0;
+        const uint16_t *inptr1 = inptr0 + ldin;
+        const uint16_t *inptr2 = inptr1 + ldin;
+        const uint16_t *inptr3 = inptr2 + ldin;
+        const uint16_t *inptr4 = inptr3 + ldin;
+        const uint16_t *inptr5 = inptr4 + ldin;
+        const uint16_t *inptr6 = inptr5 + ldin;
+        const uint16_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* Issue prefetches only when x is a multiple of 32 (every fourth
+             * pass), so the same cache lines are not prefetched repeatedly. */
+            int skippf = (x & 31);
+            __asm __volatile(
+                // Load up 8 elements (1 vector) from each of 8 sources.
+                "CBNZ    %w[skippf], 1f\n" ASM_PREFETCH("[%[inptr0], #128]")
+                ASM_PREFETCH("[%[inptr1], #128]")
+                ASM_PREFETCH("[%[inptr2], #128]")
+                ASM_PREFETCH("[%[inptr3], #128]")
+                "1:\n"
+
+                "LDR    q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
+                "LDR    q4, [%[inptr4]], #16\n" // q4=E0E1E2E3E4E5E6E7
+                "LDR    q2, [%[inptr2]], #16\n" // q2=C0C1C2C3...
+                "LDR    q6, [%[inptr6]], #16\n"
+                "ZIP1    v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
+                "ZIP2    v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
+                "ZIP1    v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
+                "ZIP2    v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
+                "LDR    q1, [%[inptr1]], #16\n"  // q1=B0B1B2B3B4B5B6B7
+                "LDR    q5, [%[inptr5]], #16\n"
+                "LDR    q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
+                "LDR    q7, [%[inptr7]], #16\n"
+                "ZIP1    v10.8h, v1.8h, v5.8h\n" // q10=B0F0B1F1B2F2B3F3
+                "ZIP2    v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
+                "ZIP1    v11.8h, v3.8h, v7.8h\n" // q11=D0H0D1H1D2H2D3H3
+                "ZIP2    v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
+
+                "ZIP1    v12.8h,  v8.8h,  v9.8h\n" // q12=A0C0E0G0A1C1E1G1
+                "ZIP2    v20.8h,  v8.8h,  v9.8h\n"
+                "ZIP1    v13.8h, v10.8h, v11.8h\n" // q13=B0D0F0H0B1D1F1H1
+                "ZIP2    v21.8h, v10.8h, v11.8h\n"
+
+                "CBNZ    %w[skippf], 2f\n" ASM_PREFETCH("[%[inptr4], #112]")
+                ASM_PREFETCH("[%[inptr5], #112]")
+                ASM_PREFETCH("[%[inptr6], #112]")
+                ASM_PREFETCH("[%[inptr7], #112]")
+                "2:\n"
+
+                "ZIP1    v22.8h, v16.8h, v17.8h\n"
+                "ZIP2    v30.8h, v16.8h, v17.8h\n"
+                "ZIP1    v23.8h, v18.8h, v19.8h\n"
+                "ZIP2    v31.8h, v18.8h, v19.8h\n"
+
+                "ZIP1    v14.8h, v12.8h, v13.8h\n"    // q14=A0B0C0D0E0F0G0H0
+                "ZIP2    v15.8h, v12.8h, v13.8h\n"    // q15=A1B1C1D1E1F1G1H1
+                "STP    q14, q15, [%[outptr]], #32\n" // Write back first two elements
+
+                "ZIP1    v0.8h, v20.8h, v21.8h\n"
+                "ZIP2    v1.8h, v20.8h, v21.8h\n"
+                "STP    q0, q1, [%[outptr]], #32\n" // Write back next two elements
+
+                "ZIP1    v2.8h, v22.8h, v23.8h\n"
+                "ZIP2    v3.8h, v22.8h, v23.8h\n"
+                "STP    q2, q3, [%[outptr]], #32\n" // Write back next two elements
+
+                "ZIP1    v4.8h, v30.8h, v31.8h\n"
+                "ZIP2    v5.8h, v30.8h, v31.8h\n"
+                "STP    q4, q5, [%[outptr]], #32\n" // Write back last two elements
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                : [skippf] "r"(skippf)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+                "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory");
+        }
+
+        /* Scalar tail for the ragged k dimension. */
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..47e4fa2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+// 8-way interleave for 32-bit data: read 8 elements (two q-registers) from
+// each of eight rows per pass and emit the 8x8 tile transposed via a ZIP
+// tree, so element k of rows 0..7 is contiguous in the output.  Rows past
+// ymax read zeros; a ragged tail in the k dimension is handled per element.
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint32_t       *outptr = (uint32_t *)out;
+    const uint32_t *inptr  = (const uint32_t *)in;
+
+    /* Out-of-range row pointers are redirected at this buffer so padding
+     * reads yield zeros; it must be explicitly zero-initialised (it was
+     * previously left uninitialised, padding with indeterminate values). */
+    uint32_t zerobuff[8] = { 0 };
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+        const uint32_t *inptr6 = inptr5 + ldin;
+        const uint32_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* The "memory" clobber is required: the asm stores through
+             * outptr, which the compiler cannot see from the operands. */
+            __asm __volatile(
+                // Load up 8 elements (2 vectors) from each of 8 sources.
+                "LDP        q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
+                "LDP        q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
+                "LDP        q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
+                "ZIP1       v16.4s, v0.4s, v4.4s\n"     // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDP        q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
+                "ZIP1       v17.4s, v2.4s, v6.4s\n"     // q17=B0D0B1D1
+                "LDP        q8, q9, [%[inptr4]], #32\n"
+                "LDP        q10, q11, [%[inptr5]], #32\n"
+                "LDP        q12, q13, [%[inptr6]], #32\n"
+                "ZIP1       v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "LDP        q14, q15, [%[inptr7]], #32\n"
+                "ZIP1       v19.4s, v10.4s, v14.4s\n"
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2       v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "ZIP2       v17.4s, v2.4s, v6.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+                "ZIP2       v18.4s, v8.4s, v12.4s\n"
+                "ZIP2       v19.4s, v10.4s, v14.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP1       v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP1       v17.4s, v3.4s, v7.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Third element
+
+                "ZIP1       v18.4s, v9.4s, v13.4s\n"
+                "ZIP1       v19.4s, v11.4s, v15.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Fourth element
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2       v16.4s, v1.4s, v5.4s\n"
+                "ZIP2       v17.4s, v3.4s, v7.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Fifth element
+
+                "ZIP2       v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]")
+                "ZIP2       v19.4s, v11.4s, v15.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Sixth element
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Seventh element
+
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory");
+        }
+
+        /* Scalar tail for the ragged k dimension. */
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
new file mode 100644
index 0000000..1d2d496
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+// 8-way interleave with fp16->fp32 widening: read 8 __fp16 values from each
+// of eight rows per pass, convert them to float with FCVTL/FCVTL2, and emit
+// the 8x8 tile transposed so element k of rows 0..7 is contiguous in the
+// output.  Rows past ymax read zeros; a ragged k tail is handled per element
+// (the scalar copies convert implicitly on assignment).
+template <>
+template <>
+inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    float        *outptr = out;
+    const __fp16 *inptr  = in;
+
+    /* Out-of-range row pointers are redirected at this buffer so padding
+     * reads yield zeros; it must be explicitly zero-initialised (it was
+     * previously left uninitialised, padding with indeterminate values). */
+    __fp16 zerobuff[8] = { 0 };
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        const __fp16 *inptr0 = inptr + y * ldin + k0;
+        const __fp16 *inptr1 = inptr0 + ldin;
+        const __fp16 *inptr2 = inptr1 + ldin;
+        const __fp16 *inptr3 = inptr2 + ldin;
+        const __fp16 *inptr4 = inptr3 + ldin;
+        const __fp16 *inptr5 = inptr4 + ldin;
+        const __fp16 *inptr6 = inptr5 + ldin;
+        const __fp16 *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* The "memory" clobber is required: the asm stores through
+             * outptr, which the compiler cannot see from the operands. */
+            __asm __volatile(
+                // Load up 8 elements (2 vectors) from each of 8 sources.
+                "LDR    q0, [%[inptr0]], #16\n"
+                "LDR    q2, [%[inptr1]], #16\n"
+                "FCVTL2    v1.4s, v0.8h\n"
+                "FCVTL    v0.4s, v0.4h\n"
+                "LDR    q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "FCVTL2    v3.4s, v2.8h\n"
+                "FCVTL    v2.4s, v2.4h\n"
+                "FCVTL2    v5.4s, v4.8h\n"
+                "FCVTL    v4.4s, v4.4h\n"
+                "ZIP1    v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDR    q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "FCVTL2    v7.4s, v6.8h\n"
+                "FCVTL    v6.4s, v6.4h\n"
+                "ZIP1    v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+                "LDR    q8, [%[inptr4]], #16\n"
+                "LDR    q10, [%[inptr5]], #16\n"
+                "FCVTL2    v9.4s, v8.8h\n"
+                "FCVTL    v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR    q12, [%[inptr6]], #16\n"
+                "FCVTL2    v11.4s, v10.8h\n"
+                "FCVTL    v10.4s, v10.4h\n"
+                "FCVTL2    v13.4s, v12.8h\n"
+                "FCVTL    v12.4s, v12.4h\n"
+                "ZIP1    v18.4s, v8.4s, v12.4s\n"
+                "LDR    q14, [%[inptr7]], #16\n"
+                "FCVTL2    v15.4s, v14.8h\n"
+                "FCVTL    v14.4s, v14.4h\n"
+                "ZIP1    v19.4s, v10.4s, v14.4s\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1    v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+                "ZIP1    v21.4s, v18.4s, v19.4s\n"
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2    v16.4s, v0.4s, v4.4s\n"
+                "ZIP2    v17.4s, v2.4s, v6.4s\n"
+                "STP    q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+                "ZIP2    v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2    v19.4s, v10.4s, v14.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1    v20.4s, v16.4s, v17.4s\n"
+                "ZIP1    v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP1    v16.4s, v1.4s, v5.4s\n"
+                "ZIP1    v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+                "STP    q20, q21, [%[outptr]], #32\n" // Third element
+
+                "ZIP1    v18.4s, v9.4s, v13.4s\n"
+                "ZIP1    v19.4s, v11.4s, v15.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Fourth element
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1    v20.4s, v16.4s, v17.4s\n"
+                "ZIP1    v21.4s, v18.4s, v19.4s\n"
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2    v16.4s, v1.4s, v5.4s\n"
+                "ZIP2    v17.4s, v3.4s, v7.4s\n"
+                "STP    q20, q21, [%[outptr]], #32\n" // Fifth element
+
+                "ZIP2    v18.4s, v9.4s, v13.4s\n"
+                "ZIP2    v19.4s, v11.4s, v15.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Sixth element
+
+                "ZIP1    v20.4s, v16.4s, v17.4s\n"
+                "ZIP1    v21.4s, v18.4s, v19.4s\n"
+                "STP    q20, q21, [%[outptr]], #32\n" // Seventh element
+
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory");
+        }
+
+        /* Scalar tail: each assignment widens __fp16 to float implicitly. */
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
new file mode 100644
index 0000000..fd6a253
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 6x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, true, 4, 4>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a 12 x uint16_t specialisation: each 32-bit element is
+    // moved as a pair of 16-bit elements, so the stride and the x range are
+    // doubled while the k range is unchanged.
+    TransformImpl<12, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 12x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a uint16_t specialisation: any 16-bit element type can be
+    // moved with the same type-agnostic byte-copy implementation.
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 12 x uint16_t version
+// Copy one 12-element (24-byte) run from in0 to out: one 16-byte q-register
+// followed by one 8-byte d-register.  in0 is advanced by 24 bytes (0x18);
+// out is written at fixed offsets and left unmodified.
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+    __asm volatile(
+        "LDR q0, [%[in0]]\n"
+        "STR q0, [%[out]]\n"
+        "LDR d1, [%[in0], #0x10]\n"
+        "STR d1, [%[out], #0x10]\n"
+        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+        : [in0] "+r"(in0),
+        [out] "+r"(out)
+        :
+        : "v0", "v1", "memory");
+}
+
+// Copy one 12-element (24-byte) run from each of two rows (48 bytes total).
+// The first 8 bytes of row 1 are loaded into x21 and inserted into v1.d[1],
+// so the tail of row 0 and the head of row 1 can be written together with a
+// single STP plus one STR.  Both input pointers advance by 24 bytes (0x18);
+// out is written at fixed offsets and left unmodified.
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+    __asm volatile(
+        "LDR q0, [%[in0]]\n"
+        "LDR d1, [%[in0], #0x10]\n"
+        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+
+        "LDR x21, [%[in1]]\n"
+        "LDR q2, [%[in1], #0x08]\n"
+        "INS v1.d[1], x21\n"
+        "ADD %x[in1], %x[in1], #0x18\n"
+        "STP q0, q1, [%[out]]\n"
+        "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [out] "+r"(out)
+        :
+        : "x21", "v0", "v1", "v2", "memory");
+}
+
+// Copy one 12-element (24-byte) run from each of four rows into consecutive
+// 24-byte output slots (offsets 0x0, 0x18, 0x30, 0x48).  Each input pointer
+// advances by 24 bytes via the post-indexed loads (0x10 + 0x08); out is
+// written at fixed offsets and left unmodified.
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+    __asm __volatile(
+        "LDR q0, [%x[in0]], #0x10\n"
+        "STR q0, [%x[out]]\n"
+        "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
+        "STR d1, [%x[out], #0x10]\n"
+
+        "LDR q0, [%x[in1]], #0x10\n"
+        "STR q0, [%x[out], #0x18]\n"
+        "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
+        "STR d1, [%x[out], #0x28]\n"
+
+        "LDR q0, [%x[in2]], #0x10\n"
+        "STR q0, [%x[out], #0x30]\n"
+        "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
+        "STR d1, [%x[out], #0x40]\n"
+
+        "LDR q0, [%x[in3]], #0x10\n"
+        "STR q0, [%x[out], #0x48]\n"
+        "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [in2] "+r"(in2),
+        [in3] "+r"(in3),
+        [out] "+r"(out)
+        :
+        : "v0", "v1", "memory");
+}
+
+// uint16_t specialisation of the 12-wide transposing transform: delegates to
+// the generic TransposeInterleaveCommon driver, which dispatches to the
+// specialised moveblock_* assembly helpers.
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+    uint16_t *out, const uint16_t *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
new file mode 100644
index 0000000..b79f32f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include "transpose_interleave_common.hpp"
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out)
+{
+    __asm __volatile(
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2    v1.4s, v0.8h\n"
+        "FCVTL    v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    d2, [%[in0]], #8\n"
+        "FCVTL    v2.4s, v2.4h\n"
+        "STR    q2, [%[out], #32]\n"
+        : [in0] "+r"(in0), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out)
+{
+    __asm __volatile(
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2    v1.4s, v0.8h\n"
+        "FCVTL    v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    d2, [%[in0]], #8\n"
+        "FCVTL    v2.4s, v2.4h\n"
+        "LDR    q3, [%[in1]], #16\n"
+        "FCVTL2    v4.4s, v3.8h\n"
+        "FCVTL    v3.4s, v3.4h\n"
+        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR    d5, [%[in1]], #8\n"
+        "FCVTL    v5.4s, v5.4h\n"
+        "STP    q4, q5, [%[out], #64]\n"
+        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out)
+{
+    __asm __volatile(
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2    v1.4s, v0.8h\n"
+        "FCVTL    v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]")
+        "FCVTL    v2.4s, v2.4h\n"
+        "LDR    q3, [%[in1]], #16\n"
+        "FCVTL2    v4.4s, v3.8h\n"
+        "FCVTL    v3.4s, v3.4h\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR    d5, [%[in1]], #8\n"
+        "FCVTL    v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDR    q6, [%[in2]], #16\n"
+        "FCVTL2    v7.4s, v6.8h\n"
+        "FCVTL    v6.4s, v6.4h\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR    d8, [%[in2]], #8\n"
+        "FCVTL    v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]")
+        "LDR    q9, [%[in3]], #16\n"
+        "FCVTL2    v10.4s, v9.8h\n"
+        "FCVTL    v9.4s, v9.4h\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR    d11, [%[in3]], #8\n"
+        "FCVTL    v11.4s, v11.4h\n"
+        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 4, 2>::Transform(
+    float *out, const __fp16 *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
new file mode 100644
index 0000000..5434599
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 12x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 4, 4>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a 24 x uint16_t specialisation
+    TransformImpl<24, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 24x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 24 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+    __asm __volatile(
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "STR    q2, [%[out], #32]\n"
+        : [in0] "+r"(in0), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+    __asm __volatile(
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "LDP    q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR    q5, [%[in1]], #16\n"
+        "STP    q4, q5, [%[out], #64]\n"
+        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+    __asm __volatile(
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDP    q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR    q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDP    q6, q7, [%[in2]], #32\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR    q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
+        "LDP    q9, q10, [%[in3]], #32\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR    q11, [%[in3]], #16\n"
+        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+    uint16_t *out, const uint16_t *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
similarity index 65%
copy from src/graph/CL/CLUnmap.cpp
copy to src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 31f2f19..8ad5b85 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLUnmap.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute::graph;
-
-CLUnmap::CLUnmap(ITensorObject *tensor)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
-{
-    _tensor->unmap(arm_compute::CLScheduler::get().queue());
-}
+#include "a32_interleave_6way_32bit.hpp"
+#include "a32_transpose_interleave_8way_32bit.hpp"
+#include "a64_block16_interleave4_8bit.hpp"
+#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
+#include "a64_interleave_8way_half_to_float.hpp"
+#include "a64_transpose_interleave_12way_16bit.hpp"
+#include "a64_transpose_interleave_12way_half_to_float.hpp"
+#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "transpose_interleave_common.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000..3218ca1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon
+{
+    // Override the moveblock_1xY methods to improve performance
+    static inline void moveblock_1x1(const TIn *&in0, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+    }
+
+    static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in1++);
+        }
+    }
+
+    static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in1++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in2++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in3++);
+        }
+    }
+
+    static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax)
+    {
+        const auto ldin = stride;
+
+        TOut      *outarray    = out;
+        const TIn *inarray     = in;
+        TOut      *outptr_base = outarray;
+        const TIn *inptr_base  = inarray + x0 + (k0 * ldin);
+        int        ldout       = (kmax - k0) * IntBy;
+
+        int k = (kmax - k0);
+        for(; k > 3; k -= 4)
+        {
+            TOut      *outptr = outptr_base;
+            const TIn *inptr  = inptr_base;
+            const TIn *inptr1 = inptr + ldin;
+            const TIn *inptr2 = inptr1 + ldin;
+            const TIn *inptr3 = inptr2 + ldin;
+
+            prefetch_3x(inptr);
+            prefetch_3x(inptr1);
+            prefetch_3x(inptr2);
+            prefetch_3x(inptr3);
+
+            outptr_base += IntBy * 4;
+            inptr_base += ldin * 4;
+
+            for(int x = (xmax - x0) / IntBy; x > 0; x--)
+            {
+                moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+                outptr += ldout;
+            }
+        }
+
+        if(k)
+        {
+            TOut      *outptr = outptr_base;
+            const TIn *inptr  = inptr_base;
+            const TIn *inptr1 = inptr + ldin;
+            const TIn *inptr2 = inptr1 + ldin;
+
+            prefetch_3x(inptr);
+            prefetch_3x(inptr1);
+            prefetch_3x(inptr2);
+
+            for(int x = (xmax - x0) / IntBy; x > 0; x--)
+            {
+                switch(k)
+                {
+                    case 3:
+                        moveblock_1x2(inptr, inptr1, outptr);
+                        moveblock_1x1(inptr2, outptr + IntBy * 2);
+                        break;
+
+                    case 2:
+                        moveblock_1x2(inptr, inptr1, outptr);
+                        break;
+
+                    case 1:
+                        moveblock_1x1(inptr, outptr);
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+
+                outptr += ldout;
+            }
+        }
+
+        // Cope with ragged X cases
+        const unsigned int overflow = (xmax - x0) % IntBy;
+        if(overflow)
+        {
+            const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+            TOut      *outptr     = outarray + ((xmax - x0) / IntBy) * ldout;
+
+            for(int k = (kmax - k0); k > 0; k--)
+            {
+                const TIn *inptr = inptr_base;
+                inptr_base += ldin;
+
+                for(unsigned int x = 0; x < IntBy; x++)
+                {
+                    TOut val  = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+                    *outptr++ = val;
+                }
+            }
+        }
+    }
+};
diff --git a/src/graph/CL/CLUnmap.cpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
similarity index 66%
rename from src/graph/CL/CLUnmap.cpp
rename to src/core/NEON/kernels/arm_gemm/utils.hpp
index 31f2f19..6c5b92a 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,31 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLUnmap.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#pragma once
 
-using namespace arm_compute::graph;
+// Macro for unreachable code (e.g. impossible default cases on switch)
+#define UNREACHABLE(why) __builtin_unreachable()
 
-CLUnmap::CLUnmap(ITensorObject *tensor)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
+// Paranoid option for the above with assert
+// #define UNREACHABLE(why)   assert(0 && why)
+
+inline int iceildiv(const int a, const int b)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+    return (a + b - 1) / b;
 }
 
-void CLUnmap::run()
+template <typename T>
+inline T roundup(const T a, const T b)
 {
-    _tensor->unmap(arm_compute::CLScheduler::get().queue());
+    T rem = a % b;
+
+    if(rem)
+    {
+        return a + b - rem;
+    }
+    else
+    {
+        return a;
+    }
 }
diff --git a/src/core/NEON/kernels/convolution/common/utils.cpp b/src/core/NEON/kernels/convolution/common/utils.cpp
index 24d0386..45847bb 100644
--- a/src/core/NEON/kernels/convolution/common/utils.cpp
+++ b/src/core/NEON/kernels/convolution/common/utils.cpp
@@ -23,18 +23,6 @@
  */
 
 #include <cstdio>
-#include <ctime>
-
-double TimeInUs(void)
-{
-#ifdef CYCLE_PROFILING
-  timespec t;
-  clock_gettime(CLOCK_REALTIME, &t);
-  return 1e6*t.tv_sec + 1e-3*t.tv_nsec;
-#else
-  return 0;
-#endif
-}
 
 void PrintMatrix(const float* const m, const int M, const int N, const int row_stride)
 {
@@ -47,4 +35,4 @@
     printf("\n");
   }
   printf("\n");
-}
+}
\ No newline at end of file
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index fa50f79..9b3a60d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -28,412 +28,543 @@
 using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
 using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 1, 1, float, float>;
 
+#ifdef __aarch64__
+
 template <>
-const Conv::TileFn Conv::tile_fns
-  [max_in_pad_top]
-  [max_in_pad_left]
-  [max_in_pad_bottom]
-  [max_in_pad_right]
-  [max_out_pad_bottom]
-  [max_out_pad_right] = {
-  {  // Input pad top = 0
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 2
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 2
-    },  // Input pad left = 1
-  },  // Input pad top = 0
-  {  // Input pad top = 1
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 2
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-      },  // Input pad bottom = 2
-    },  // Input pad left = 1
-  },  // Input pad top = 1
+// Optimised (AArch64 NEON inline-asm) specialisation of process_tile for the
+// fully unpadded case (<true, 0, 0, 0, 0, 0, 0>): no input padding and no
+// output clipping.  Computes one output tile of this depthwise convolution
+// (instantiated at the bottom of this file as 2x2 output, 3x3 kernel, unit
+// stride) across `n_channels` channels.
+//
+// Channels are processed four at a time (one 128-bit q register of f32),
+// with the main loop software-pipelined two-deep (the "A"/"B" halves below).
+// Any remaining channels (< 4) are cleaned up by the generic C++
+// implementation at the end.  All strides are in elements and are scaled to
+// bytes at the asm boundary.  The six trailing unnamed ints are the (unused
+// here) padding parameters of the generic signature.
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+  const int n_channels,
+  const float* const weights,
+  const int weight_row_stride,
+  const int weight_col_stride,
+  const float* const inptr,
+  const int in_row_stride,
+  const int in_col_stride,
+  float* const outptr,
+  const int out_row_stride,
+  const int out_col_stride,
+  const int, const int, const int, const int, const int, const int
+)
+{
+  // Copy pointers
+  const float *uptr0 = inptr;
+  const float *wptr0 = weights;
+  float *vptr0 = outptr;
+
+  int channels_remaining = n_channels;
+  if (channels_remaining >= 4)
+  {
+    // Process blocks of 4 channels at a time
+    // The asm loop handles TWO 4-channel blocks per pass (the A and B
+    // pipeline stages); the final pass is peeled off into the "even" (3:) or
+    // "odd" (4:) tail.  Hence: n_iters = number of full A+B loop passes,
+    // odd_tail = whether a single trailing 4-channel block remains.
+    int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
+    const bool odd_tail = (channels_remaining / 4) & 1;
+    channels_remaining %= 4;
+
+    asm volatile (
+      // Register aliases: names encode operand and position - vUrc is the
+      // input (U) patch value at row r, column c; vWrc the kernel weight at
+      // (r, c); vVrc the output accumulator for tile position (r, c).  The
+      // A/B suffix selects the pipeline stage; the q/v prefix is the same
+      // register viewed as a 128-bit load/store target vs. a vector operand.
+      // Several names share one physical register (presumably where their
+      // live ranges do not overlap - confirm before rescheduling).
+      "qW11B .req q0\n" "vW11B .req v0\n" "qW33A .req q1\n" "qU32B .req q1\n"
+      "vW33A .req v1\n" "vU32B .req v1\n" "qU44B .req q2\n" "qW21A .req q2\n"
+      "vU44B .req v2\n" "vW21A .req v2\n" "qU21B .req q3\n" "qU32A .req q3\n"
+      "vU21B .req v3\n" "vU32A .req v3\n" "qU43A .req q4\n" "qV21B .req q4\n"
+      "vU43A .req v4\n" "vV21B .req v4\n" "qU24A .req q5\n" "qU44A .req q5\n"
+      "qU33B .req q5\n" "vU24A .req v5\n" "vU44A .req v5\n" "vU33B .req v5\n"
+      "qU31A .req q6\n" "qV12B .req q6\n" "qU23A .req q6\n" "vU31A .req v6\n"
+      "vV12B .req v6\n" "vU23A .req v6\n" "qW31B .req q7\n" "qV22A .req q7\n"
+      "vW31B .req v7\n" "vV22A .req v7\n" "qV12A .req q8\n" "qW21B .req q8\n"
+      "vV12A .req v8\n" "vW21B .req v8\n" "qU22B .req q9\n" "qU34A .req q9\n"
+      "vU22B .req v9\n" "vU34A .req v9\n" "qU13B .req q10\n" "qU13A .req q10\n"
+      "vU13B .req v10\n" "vU13A .req v10\n" "qU34B .req q11\n" "qU22A .req q11\n"
+      "vU34B .req v11\n" "vU22A .req v11\n" "qU24B .req q12\n" "qU31B .req q12\n"
+      "vU24B .req v12\n" "vU31B .req v12\n" "qW12B .req q13\n" "qW13A .req q13\n"
+      "vW12B .req v13\n" "vW13A .req v13\n" "qV21A .req q14\n" "qV11B .req q14\n"
+      "vV21A .req v14\n" "vV11B .req v14\n" "qW32A .req q15\n" "qW32B .req q15\n"
+      "vW32A .req v15\n" "vW32B .req v15\n" "qW31A .req q16\n" "qV22B .req q16\n"
+      "vW31A .req v16\n" "vV22B .req v16\n"
+      "qW11A .req q17\n" "vW11A .req v17\n" "qW13B .req q18\n" "qU14A .req q18\n"
+      "vW13B .req v18\n" "vU14A .req v18\n" "qU33A .req q19\n" "qW33B .req q19\n"
+      "vU33A .req v19\n" "vW33B .req v19\n" "qW22A .req q20\n" "qU23B .req q20\n"
+      "vW22A .req v20\n" "vU23B .req v20\n" "qU12A .req q21\n" "qU42A .req q21\n"
+      "vU12A .req v21\n" "vU42A .req v21\n" "qU41A .req q22\n" "qU42B .req q22\n"
+      "vU41A .req v22\n" "vU42B .req v22\n" "qW23A .req q23\n" "qW23B .req q23\n"
+      "vW23A .req v23\n" "vW23B .req v23\n" "qU43B .req q24\n" "qU11A .req q24\n"
+      "vU43B .req v24\n" "vU11A .req v24\n" "qU12B .req q25\n" "qW12A .req q25\n"
+      "vU12B .req v25\n" "vW12A .req v25\n" "qU41B .req q26\n" "qV11A .req q26\n"
+      "vU41B .req v26\n" "vV11A .req v26\n" "qW22B .req q27\n" "vW22B .req v27\n"
+      "qU11B .req q28\n" "qU14B .req q28\n" "vU11B .req v28\n" "vU14B .req v28\n"
+      "qU21A .req q29\n" "vU21A .req v29\n"
+
+      // Scratch general-purpose register aliases for the derived column
+      // strides and the row pointers beyond row 0.
+      "u_col_stride1 .req %x[u_col_stride]\n"
+      "u_col_stride2 .req x0\n"
+      "u_col_stride3 .req x1\n"
+      "uptr1 .req x2\n"
+      "uptr2 .req x3\n"
+      "uptr3 .req x4\n"
+      "wptr1 .req x5\n"
+      "wptr2 .req x6\n"
+      "vptr1 .req x7\n"
+      "w_col_stride1 .req %x[w_col_stride]\n"
+      "w_col_stride2 .req x8\n"
+
+      // Prepare strides and pointers
+      "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+      "add uptr2,    uptr1 , %x[u_row_stride]\n"
+      "add uptr3,    uptr2 , %x[u_row_stride]\n"
+      "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+      "add wptr2,    wptr1 , %x[w_row_stride]\n"
+      "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+      "add u_col_stride2, %x[u_col_stride], %x[u_col_stride]\n"
+      "add u_col_stride3,    u_col_stride2 , %x[u_col_stride]\n"
+      "add w_col_stride2, %x[w_col_stride], %x[w_col_stride]\n"
+
+      // Load in preparation for execution
+      "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+      "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+      "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+      "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+      "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+      "ldr qW11A, [%x[wptr0]], #0x10\n"
+      "ldr qU24A, [uptr1, u_col_stride3]\n"
+      "ldr qW23A, [wptr1, w_col_stride2]\n"
+      "ldr qU23A, [uptr1, u_col_stride2]\n"
+      "ldr qW22A, [wptr1, w_col_stride1]\n"
+      "ldr qU22A, [uptr1, u_col_stride1]\n"
+      "ldr qW21A, [wptr1], #0x10\n"
+      "ldr qU34A, [uptr2, u_col_stride3]\n"
+      "ldr qW33A, [wptr2, w_col_stride2]\n"
+      "ldr qU33A, [uptr2, u_col_stride2]\n"
+      "ldr qW32A, [wptr2, w_col_stride1]\n"
+      "ldr qU32A, [uptr2, u_col_stride1]\n"
+      "ldr qW31A, [wptr2], #0x10\n"
+      "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+      "cbz %x[iters], 2f\n"  // Jump to tail if doing zero iterations of loop
+
+      "1:"  // Main loop body
+        // A part
+        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+        "ldr qU44A, [uptr3, u_col_stride3]\n"
+        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+        "ldr qU43A, [uptr3, u_col_stride2]\n"
+        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+        "ldr qU42A, [uptr3, u_col_stride1]\n"
+        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+        "ldr qU11A, [%x[uptr0]], #0x10\n"
+        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+        "ldr qU21A, [uptr1], #0x10\n"
+        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+        "ldr qU31A, [uptr2], #0x10\n"
+        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+        "ldr qU41A, [uptr3], #0x10\n"
+        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+        "ldr qW11B, [%x[wptr0]], #0x10\n"
+        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+        "ldr qU24B, [uptr1, u_col_stride3]\n"
+        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+        "ldr qW23B, [wptr1, w_col_stride2]\n"
+        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+        "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
+        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+        "ldr qU23B, [uptr1, u_col_stride2]\n"
+        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+        "ldr qW22B, [wptr1, w_col_stride1]\n"
+        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+        "ldr qU22B, [uptr1, u_col_stride1]\n"
+        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+        "ldr qW21B, [wptr1], #0x10\n"
+        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+        "ldr qU34B, [uptr2, u_col_stride3]\n"
+        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+        "ldr qW33B, [wptr2, w_col_stride2]\n"
+        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+        "str qV22A, [vptr1, %x[v_col_stride]]\n"
+        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+        "ldr qU33B, [uptr2, u_col_stride2]\n"
+        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+        "ldr qW32B, [wptr2, w_col_stride1]\n"
+        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+        "ldr qU32B, [uptr2, u_col_stride1]\n"
+        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+        "str qV11A, [%x[vptr0]], #0x10\n"
+        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+        "ldr qW31B, [wptr2], #0x10\n"
+        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+        "str qV21A, [vptr1], #0x10\n"
+
+        // B part
+        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+        "subs %x[iters], %x[iters], #1\n"
+        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+        "ldr qU44B, [uptr3, u_col_stride3]\n"
+        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+        "ldr qU43B, [uptr3, u_col_stride2]\n"
+        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+        "ldr qU42B, [uptr3, u_col_stride1]\n"
+        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+        "ldr qU11B, [%x[uptr0]], #0x10\n"
+        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+        "ldr qU21B, [uptr1], #0x10\n"
+        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+        "ldr qU31B, [uptr2], #0x10\n"
+        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+        "ldr qU41B, [uptr3], #0x10\n"
+        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+        "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+        "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+        "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+        "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+        "ldr qW11A, [%x[wptr0]], #0x10\n"
+        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+        "ldr qU24A, [uptr1, u_col_stride3]\n"
+        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+        "ldr qW23A, [wptr1, w_col_stride2]\n"
+        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+        "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
+        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+        "ldr qU23A, [uptr1, u_col_stride2]\n"
+        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+        "ldr qW22A, [wptr1, w_col_stride1]\n"
+        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+        "ldr qU22A, [uptr1, u_col_stride1]\n"
+        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+        "ldr qW21A, [wptr1], #0x10\n"
+        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+        "ldr qU34A, [uptr2, u_col_stride3]\n"
+        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+        "ldr qW33A, [wptr2, w_col_stride2]\n"
+        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+        "str qV22B, [vptr1, %x[v_col_stride]]\n"
+        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+        "ldr qU33A, [uptr2, u_col_stride2]\n"
+        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+        "ldr qW32A, [wptr2, w_col_stride1]\n"
+        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+        "ldr qU32A, [uptr2, u_col_stride1]\n"
+        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+        "str qV11B, [%x[vptr0]], #0x10\n"
+        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+        "ldr qW31A, [wptr2], #0x10\n"
+        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+        "str qV21B, [vptr1], #0x10\n"
+        "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+        "bne 1b\n"  // Loop
+
+      "2:"  // Branch destination for zero loops
+        // One or two 4-channel blocks remain; pick the matching tail.
+        "cbnz %w[odd_tail], 4f\n"
+
+      "3:"  // Even number of iterations
+        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+        "ldr qU44A, [uptr3, u_col_stride3]\n"
+        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+        "ldr qU43A, [uptr3, u_col_stride2]\n"
+        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+        "ldr qU42A, [uptr3, u_col_stride1]\n"
+        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+        "ldr qU11A, [%x[uptr0]], #0x10\n"
+        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+        "ldr qU21A, [uptr1], #0x10\n"
+        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+        "ldr qU31A, [uptr2], #0x10\n"
+        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+        "ldr qU41A, [uptr3], #0x10\n"
+        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+        "ldr qW11B, [%x[wptr0]], #0x10\n"
+        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+        "ldr qU24B, [uptr1, u_col_stride3]\n"
+        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+        "ldr qW23B, [wptr1, w_col_stride2]\n"
+        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+        "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
+        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+        "ldr qU23B, [uptr1, u_col_stride2]\n"
+        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+        "ldr qW22B, [wptr1, w_col_stride1]\n"
+        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+        "ldr qU22B, [uptr1, u_col_stride1]\n"
+        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+        "ldr qW21B, [wptr1], #0x10\n"
+        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+        "ldr qU34B, [uptr2, u_col_stride3]\n"
+        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+        "ldr qW33B, [wptr2, w_col_stride2]\n"
+        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+        "str qV22A, [vptr1, %x[v_col_stride]]\n"
+        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+        "ldr qU33B, [uptr2, u_col_stride2]\n"
+        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+        "ldr qW32B, [wptr2, w_col_stride1]\n"
+        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+        "ldr qU32B, [uptr2, u_col_stride1]\n"
+        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+        "str qV11A, [%x[vptr0]], #0x10\n"
+        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+        "ldr qW31B, [wptr2], #0x10\n"
+        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+        "str qV21A, [vptr1], #0x10\n"
+
+        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+        "ldr qU44B, [uptr3, u_col_stride3]\n"
+        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+        "ldr qU43B, [uptr3, u_col_stride2]\n"
+        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+        "ldr qU42B, [uptr3, u_col_stride1]\n"
+        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+        "ldr qU11B, [%x[uptr0]], #0x10\n"
+        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+        "ldr qU21B, [uptr1], #0x10\n"
+        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+        "ldr qU31B, [uptr2], #0x10\n"
+        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+        "ldr qU41B, [uptr3], #0x10\n"
+        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+        "str qV12B, [%x[vptr0], %x[v_col_stride]]\n"
+        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+        "str qV22B, [vptr1, %x[v_col_stride]]\n"
+        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+        "str qV11B, [%x[vptr0]], #0x10\n"
+        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+        "str qV21B, [vptr1], #0x10\n"
+        "b 5f\n"
+
+      "4:"  // Odd number of iterations
+        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+        "ldr qU44A, [uptr3, u_col_stride3]\n"
+        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+        "ldr qU43A, [uptr3, u_col_stride2]\n"
+        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+        "ldr qU42A, [uptr3, u_col_stride1]\n"
+        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+        "ldr qU11A, [%x[uptr0]], #0x10\n"
+        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+        "ldr qU21A, [uptr1], #0x10\n"
+        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+        "ldr qU31A, [uptr2], #0x10\n"
+        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+        "ldr qU41A, [uptr3], #0x10\n"
+        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+        "str qV12A, [%x[vptr0], %x[v_col_stride]]\n"
+        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+        "str qV22A, [vptr1, %x[v_col_stride]]\n"
+        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+        "str qV11A, [%x[vptr0]], #0x10\n"
+        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+        "str qV21A, [vptr1], #0x10\n"
+
+      "5:"  // End of method
+
+      // Release all assembler register aliases.
+      ".unreq qW11B\n" ".unreq qW33A\n" ".unreq qU32B\n"
+      ".unreq qU44B\n" ".unreq qW21A\n" ".unreq qU21B\n" ".unreq qU32A\n"
+      ".unreq qU43A\n" ".unreq qV21B\n"
+      ".unreq qU24A\n" ".unreq qU44A\n" ".unreq qU33B\n"
+      ".unreq qU31A\n" ".unreq qV12B\n" ".unreq qU23A\n"
+      ".unreq qW31B\n" ".unreq qV22A\n" ".unreq qV12A\n" ".unreq qW21B\n"
+      ".unreq qU22B\n" ".unreq qU34A\n" ".unreq qU13B\n" ".unreq qU13A\n"
+      ".unreq qU34B\n" ".unreq qU22A\n" ".unreq qU24B\n" ".unreq qU31B\n"
+      ".unreq qW12B\n" ".unreq qW13A\n" ".unreq qV21A\n" ".unreq qV11B\n"
+      ".unreq qW32A\n" ".unreq qW32B\n" ".unreq qW31A\n" ".unreq qV22B\n"
+      ".unreq qW11A\n" ".unreq qW13B\n" ".unreq qU14A\n"
+      ".unreq qU33A\n" ".unreq qW33B\n" ".unreq qW22A\n" ".unreq qU23B\n"
+      ".unreq qU12A\n" ".unreq qU42A\n" ".unreq qU41A\n" ".unreq qU42B\n"
+      ".unreq qW23A\n" ".unreq qW23B\n" ".unreq qU43B\n" ".unreq qU11A\n"
+      ".unreq qU12B\n" ".unreq qW12A\n" ".unreq qU41B\n" ".unreq qV11A\n"
+      ".unreq qW22B\n" ".unreq qU11B\n" ".unreq qU14B\n" ".unreq qU21A\n"
+      ".unreq vW11B\n" ".unreq vW33A\n" ".unreq vU32B\n"
+      ".unreq vU44B\n" ".unreq vW21A\n" ".unreq vU21B\n" ".unreq vU32A\n"
+      ".unreq vU43A\n" ".unreq vV21B\n"
+      ".unreq vU24A\n" ".unreq vU44A\n" ".unreq vU33B\n"
+      ".unreq vU31A\n" ".unreq vV12B\n" ".unreq vU23A\n"
+      ".unreq vW31B\n" ".unreq vV22A\n" ".unreq vV12A\n" ".unreq vW21B\n"
+      ".unreq vU22B\n" ".unreq vU34A\n" ".unreq vU13B\n" ".unreq vU13A\n"
+      ".unreq vU34B\n" ".unreq vU22A\n" ".unreq vU24B\n" ".unreq vU31B\n"
+      ".unreq vW12B\n" ".unreq vW13A\n" ".unreq vV21A\n" ".unreq vV11B\n"
+      ".unreq vW32A\n" ".unreq vW32B\n" ".unreq vW31A\n" ".unreq vV22B\n"
+      ".unreq vW11A\n" ".unreq vW13B\n" ".unreq vU14A\n"
+      ".unreq vU33A\n" ".unreq vW33B\n" ".unreq vW22A\n" ".unreq vU23B\n"
+      ".unreq vU12A\n" ".unreq vU42A\n" ".unreq vU41A\n" ".unreq vU42B\n"
+      ".unreq vW23A\n" ".unreq vW23B\n" ".unreq vU43B\n" ".unreq vU11A\n"
+      ".unreq vU12B\n" ".unreq vW12A\n" ".unreq vU41B\n" ".unreq vV11A\n"
+      ".unreq vW22B\n" ".unreq vU11B\n" ".unreq vU14B\n" ".unreq vU21A\n"
+      ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
+      ".unreq u_col_stride3\n"
+      ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n"
+      ".unreq wptr1\n" ".unreq wptr2\n" ".unreq vptr1\n"
+      ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
+
+      // The data pointers and loop counter are read-write ("+r") so the C++
+      // tail below continues from wherever the asm stopped; element strides
+      // are scaled to byte strides here at the boundary.
+      : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
+        [iters] "+r" (n_iters)
+      : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+        [u_col_stride] "r" (in_col_stride * sizeof(float)),
+        [v_row_stride] "r" (out_row_stride * sizeof(float)),
+        [v_col_stride] "r" (out_col_stride * sizeof(float)),
+        [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+        [w_col_stride] "r" (weight_col_stride * sizeof(float)),
+        [odd_tail] "r" (odd_tail)
+      : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "cc",
+        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "memory"
+    );
+  }
+
+  if (channels_remaining)
+  {
+    // Fall back on the unoptimised version to clean up the tail
+    // If the vector path ran, it advanced uptr0/wptr0/vptr0 past the
+    // channels it consumed, so the generic version starts at the first
+    // unprocessed channel.
+    ConvImpl::process_tile<false>(
+        channels_remaining,
+        wptr0, weight_row_stride, weight_col_stride,
+        uptr0, in_row_stride, in_col_stride,
+        vptr0, out_row_stride, out_col_stride,
+        0, 0, 0, 0, 0, 0
+    );
+  }
+}
+
+#endif  // __aarch64__
+
+// Tile function used for interior tiles needing no padding handling at all.
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+// Tile functions for tiles clipped by input padding at the top (second
+// template argument of process_tile).  Entry i presumably corresponds to
+// pad_top = i + 1, the zero-padding case being tilefn_unpadded - confirm
+// against the dispatching code.
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
 };
 
+// Tile functions for tiles clipped by input padding on the left (third
+// template argument of process_tile); indexed analogously to tilefn_top.
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+// Tile functions for tiles clipped at the bottom, indexed as
+// [input pad bottom][output pad bottom] - these indices appear as the 4th
+// and 6th template arguments of process_tile respectively.
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+  },
+};
+
+// Tile functions for tiles clipped on the right, indexed as
+// [input pad right][output pad right] - these indices appear as the 5th
+// and 7th template arguments of process_tile respectively.
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+  },
+};
+
+// Generic (non-specialised, padding-aware) fallback used when no
+// precompiled specialisation matches the tile's padding configuration.
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
 
 template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index 0ec5a77..dba2330 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -29,1067 +29,70 @@
 using ConvImpl = DepthwiseConvolutionImpl<2, 2, 3, 3, 2, 2, float, float>;
 
 template <>
-const Conv::TileFn Conv::tile_fns
-  [max_in_pad_top]
-  [max_in_pad_left]
-  [max_in_pad_bottom]
-  [max_in_pad_right]
-  [max_out_pad_bottom]
-  [max_out_pad_right] = {
-  {  // Input pad top = 0
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 1
-  },  // Input pad top = 0
-  {  // Input pad top = 1
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 0, 1>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 1, 1>,
-          },  // Output pad bottom = 1
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 1
-  },  // Input pad top = 1
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
 };
 
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
 
 template class DepthwiseConvolution<2, 2, 3, 3, 2, 2, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index dc3c383..b946e5d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -28,1148 +28,928 @@
 using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
 using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>;
 
+#ifdef __aarch64__
+
 template <>
-const Conv::TileFn Conv::tile_fns
-  [max_in_pad_top]
-  [max_in_pad_left]
-  [max_in_pad_bottom]
-  [max_in_pad_right]
-  [max_out_pad_bottom]
-  [max_out_pad_right] = {
-  {  // Input pad top = 0
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 3
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 3
-    },  // Input pad left = 1
-  },  // Input pad top = 0
-  {  // Input pad top = 1
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 3
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-      },  // Input pad bottom = 3
-    },  // Input pad left = 1
-  },  // Input pad top = 1
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+  const int n_channels,
+  const float* const weights,
+  const int weight_row_stride,
+  const int weight_col_stride,
+  const float* const inptr,
+  const int in_row_stride,
+  const int in_col_stride,
+  float* const outptr,
+  const int out_row_stride,
+  const int out_col_stride,
+  const int, const int, const int, const int, const int, const int
+)
+{
+  // Copy pointers
+  const float *uptr0 = inptr;
+  const float *wptr0 = weights;
+  float *vptr0 = outptr;
+
+  int channels_remaining = n_channels;
+  if (channels_remaining >= 4)
+  {
+    // Process blocks of 4 channels at a time
+    int n_iters = ((channels_remaining / 4) + 1)/2 - 1;
+    const bool odd_tail = (channels_remaining / 4) & 1;
+    channels_remaining %= 4;
+
+    asm volatile (
+        "qU22B .req q0\n" "qU23B .req q0\n" "qW22A .req q0\n"
+        "vU22B .req v0\n" "vU23B .req v0\n" "vW22A .req v0\n"
+        "qV12A .req q1\n" "qW11B .req q1\n"
+        "vV12A .req v1\n" "vW11B .req v1\n"
+        "qU41A .req q2\n" "qU32B .req q2\n" "qU33A .req q2\n" "qV13B .req q2\n"
+        "vU41A .req v2\n" "vU32B .req v2\n" "vU33A .req v2\n" "vV13B .req v2\n"
+        "qU42B .req q3\n" "qU13B .req q3\n" "qU44B .req q3\n" "qU55A .req q3\n"
+        "vU42B .req v3\n" "vU13B .req v3\n" "vU44B .req v3\n" "vU55A .req v3\n"
+        "qU34B .req q4\n" "qU15A .req q4\n" "qU42A .req q4\n" "qU44A .req q4\n" "qU12B .req q4\n"
+        "vU34B .req v4\n" "vU15A .req v4\n" "vU42A .req v4\n" "vU44A .req v4\n" "vU12B .req v4\n"
+        "qU33B .req q5\n" "qU52A .req q5\n" "qW23A .req q5\n"
+        "vU33B .req v5\n" "vU52A .req v5\n" "vW23A .req v5\n"
+        "qV31A .req q6\n" "qU13A .req q6\n" "qV12B .req q6\n"
+        "vV31A .req v6\n" "vU13A .req v6\n" "vV12B .req v6\n"
+        "qU35B .req q7\n" "qU51B .req q7\n" "qV11A .req q7\n" "qU53B .req q7\n"
+        "vU35B .req v7\n" "vU51B .req v7\n" "vV11A .req v7\n" "vU53B .req v7\n"
+        "qW21A .req q8\n" "qV22B .req q8\n"
+        "vW21A .req v8\n" "vV22B .req v8\n"
+        "qV33B .req q9\n" "qU14A .req q9\n" "qV23A .req q9\n" "qU25B .req q9\n"
+        "vV33B .req v9\n" "vU14A .req v9\n" "vV23A .req v9\n" "vU25B .req v9\n"
+        "qW21B .req q10\n" "qV32A .req q10\n" "qU35A .req q10\n"
+        "vW21B .req v10\n" "vV32A .req v10\n" "vU35A .req v10\n"
+        "qV11B .req q11\n" "qU15B .req q11\n" "qV33A .req q11\n"
+        "vV11B .req v11\n" "vU15B .req v11\n" "vV33A .req v11\n"
+        "qU11B .req q12\n" "qW23B .req q12\n" "qU45A .req q12\n"
+        "vU11B .req v12\n" "vW23B .req v12\n" "vU45A .req v12\n"
+        "qW11A .req q13\n" "qU45B .req q13\n" "qU52B .req q13\n"
+        "vW11A .req v13\n" "vU45B .req v13\n" "vU52B .req v13\n"
+        "qU55B .req q14\n" "qU25A .req q14\n" "qV21A .req q14\n"
+        "vU55B .req v14\n" "vU25A .req v14\n" "vV21A .req v14\n"
+        "qU53A .req q15\n" "qV21B .req q15\n" "qU31A .req q15\n"
+        "vU53A .req v15\n" "vV21B .req v15\n" "vU31A .req v15\n"
+        "qW13B .req q16\n" "qU23A .req q16\n"
+        "vW13B .req v16\n" "vU23A .req v16\n"
+        "qW33B .req q17\n" "qW33A .req q17\n"
+        "vW33B .req v17\n" "vW33A .req v17\n"
+        "qU24B .req q18\n" "qU32A .req q18\n" "qV31B .req q18\n" "qV13A .req q18\n"
+        "vU24B .req v18\n" "vU32A .req v18\n" "vV31B .req v18\n" "vV13A .req v18\n"
+        "qU31B .req q19\n" "qU11A .req q19\n" "qU54B .req q19\n" "qU43A .req q19\n"
+        "vU31B .req v19\n" "vU11A .req v19\n" "vU54B .req v19\n" "vU43A .req v19\n"
+        "qU24A .req q20\n" "qW12B .req q20\n" "qU54A .req q20\n"
+        "vU24A .req v20\n" "vW12B .req v20\n" "vU54A .req v20\n"
+        "qV23B .req q21\n" "qW12A .req q21\n"
+        "vV23B .req v21\n" "vW12A .req v21\n"
+        "qW32A .req q22\n" "qU43B .req q22\n"
+        "vW32A .req v22\n" "vU43B .req v22\n"
+        "qW31A .req q23\n" "qV32B .req q23\n"
+        "vW31A .req v23\n" "vV32B .req v23\n"
+        "qU22A .req q24\n" "qW31B .req q24\n"
+        "vU22A .req v24\n" "vW31B .req v24\n"
+        "qU21B .req q25\n" "qV22A .req q25\n"
+        "vU21B .req v25\n" "vV22A .req v25\n"
+        "qU34A .req q26\n" "qW22B .req q26\n" "qU12A .req q26\n"
+        "vU34A .req v26\n" "vW22B .req v26\n" "vU12A .req v26\n"
+        "qW13A .req q27\n" "qU51A .req q27\n"
+        "vW13A .req v27\n" "vU51A .req v27\n"
+        "qW32B .req q28\n"
+        "vW32B .req v28\n"
+        "qU41B .req q29\n" "qU14B .req q29\n"
+        "vU41B .req v29\n" "vU14B .req v29\n"
+        "qU21A .req q30\n"
+        "vU21A .req v30\n"
+
+        "uptr1 .req x0\n"
+        "uptr2 .req x1\n"
+        "uptr3 .req x2\n"
+        "uptr4 .req x3\n"
+
+        "u_col_stride1 .req %x[u_col_stride]\n"
+        "u_col_stride2 .req x4\n"
+        "u_col_stride3 .req x5\n"
+        "u_col_stride4 .req x6\n"
+
+        "wptr1 .req x7\n"
+        "wptr2 .req x8\n"
+        "w_col_stride1 .req %x[w_col_stride]\n"
+        "w_col_stride2 .req x9\n"
+
+        "vptr1 .req x10\n"
+        "vptr2 .req x11\n"
+        "v_col_stride1 .req %x[v_col_stride]\n"
+        "v_col_stride2 .req x12\n"
+
+        // Prepare strides and pointers
+        "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+        "add uptr2,    uptr1 , %x[u_row_stride]\n"
+        "add uptr3,    uptr2 , %x[u_row_stride]\n"
+        "add uptr4,    uptr3 , %x[u_row_stride]\n"
+        "add u_col_stride2, u_col_stride1, u_col_stride1\n"
+        "add u_col_stride3, u_col_stride2, u_col_stride1\n"
+        "add u_col_stride4, u_col_stride3, u_col_stride1\n"
+
+        "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+        "add wptr2,    wptr1 , %x[w_row_stride]\n"
+        "add w_col_stride2, w_col_stride1, w_col_stride1\n"
+
+        "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+        "add vptr2,    vptr1 , %x[v_row_stride]\n"
+        "add v_col_stride2, v_col_stride1, v_col_stride1\n"
+
+        // Pre-load for A
+        "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+        "ldr qW23A, [wptr1, w_col_stride2]\n"
+        "ldr qW33A, [wptr2, w_col_stride2]\n"
+        "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+        "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
+        "ldr qW22A, [wptr1, w_col_stride1]\n"
+        "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+        "ldr qW32A, [wptr2, w_col_stride1]\n"
+        "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+        "ldr qU25A, [uptr1, u_col_stride4]\n"
+        "ldr qU24A, [uptr1, u_col_stride3]\n"
+        "ldr qW11A, [%x[wptr0]], #0x10\n"
+        "ldr qU23A, [uptr1, u_col_stride2]\n"
+        "ldr qW21A, [wptr1], #0x10\n"
+        "ldr qW31A, [wptr2], #0x10\n"
+        "ldr qU34A, [uptr2, u_col_stride3]\n"
+        "ldr qU35A, [uptr2, u_col_stride4]\n"
+
+        // First part of A
+        "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
+        "ldr qU33A, [uptr2, u_col_stride2]\n"
+        "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+        "cbz %x[n_iters], 2f\n"  // Jump to tail if not looping
+
+        "1:"  // Main loop, double unrolled
+        // A Part
+        "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
+        "ldr qU45A, [uptr3, u_col_stride4]\n"
+        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+        "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
+        "ldr qU44A, [uptr3, u_col_stride3]\n"
+        "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
+        "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
+        "ldr qU43A, [uptr3, u_col_stride2]\n"
+        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+        "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
+        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+        "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
+        "ldr qU55A, [uptr4, u_col_stride4]\n"
+        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+        "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
+        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+        "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
+        "ldr qU54A, [uptr4, u_col_stride3]\n"
+        "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
+        "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
+        "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
+        "ldr qU53A, [uptr4, u_col_stride2]\n"
+        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+        "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+        "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
+        "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
+        "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
+        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+        "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
+        "str qV13A, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+        "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
+        "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
+        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
+        "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
+        "ldr qU22A, [uptr1, u_col_stride1]\n"
+        "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
+        "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
+        "ldr qU32A, [uptr2, u_col_stride1]\n"
+        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+        "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
+        "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
+        "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
+        "ldr qU42A, [uptr3, u_col_stride1]\n"
+        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+        "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
+        "str qV23A, [vptr1, v_col_stride2]\n"
+        "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
+        "ldr qW23B, [wptr1, w_col_stride2]\n"
+        "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
+        "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
+        "ldr qU52A, [uptr4, u_col_stride1]\n"
+        "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
+        "ldr qU11A, [%x[uptr0]], #0x10\n"
+        "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
+        "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
+        "ldr qU21A, [uptr1], #0x10\n"
+        "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
+        "ldr qW33B, [wptr2, w_col_stride2]\n"
+        "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
+        "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
+        "str qV33A, [vptr2, v_col_stride2]\n"
+        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+        "ldr qU31A, [uptr2], #0x10\n"
+        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+        "ldr qU41A, [uptr3], #0x10\n"
+        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+        "ldr qU51A, [uptr4], #0x10\n"
+        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+        "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
+        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+        "ldr qW22B, [wptr1, w_col_stride1]\n"
+        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+        "str qV12A, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+        "ldr qW32B, [wptr2, w_col_stride1]\n"
+        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
+        "ldr qU25B, [uptr1, u_col_stride4]\n"
+        "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
+        "ldr qU24B, [uptr1, u_col_stride3]\n"
+        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+        "str qV22A, [vptr1, v_col_stride1]\n"
+        "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
+        "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
+        "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
+        "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
+        "str qV32A, [vptr2, v_col_stride1]\n"
+        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+        "ldr qW11B, [%x[wptr0]], #0x10\n"
+        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+        "ldr qU23B, [uptr1, u_col_stride2]\n"
+        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+        "ldr qW21B, [wptr1], #0x10\n"
+        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+        "str qV11A, [%x[vptr0]], #0x10\n"
+        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+        "ldr qW31B, [wptr2], #0x10\n"
+        "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
+        "ldr qU34B, [uptr2, u_col_stride3]\n"
+        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+        "str qV21A, [vptr1], #0x10\n"
+        "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
+        "ldr qU35B, [uptr2, u_col_stride4]\n"
+        "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
+        "str qV31A, [vptr2], #0x10\n"
+
+        // B Part
+        "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
+        "ldr qU33B, [uptr2, u_col_stride2]\n"
+        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+        "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
+        "ldr qU45B, [uptr3, u_col_stride4]\n"
+        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+        "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
+        "ldr qU44B, [uptr3, u_col_stride3]\n"
+        "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
+        "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
+        "ldr qU43B, [uptr3, u_col_stride2]\n"
+        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+        "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
+        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+        "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
+        "ldr qU55B, [uptr4, u_col_stride4]\n"
+        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+        "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
+        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+        "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
+        "ldr qU54B, [uptr4, u_col_stride3]\n"
+        "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
+        "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
+        "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
+        "ldr qU53B, [uptr4, u_col_stride2]\n"
+        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+        "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
+        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+        "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
+        "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
+        "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
+        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+        "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
+        "str qV13B, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+        "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
+        "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
+        "ldr qW13A, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
+        "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
+        "ldr qU22B, [uptr1, u_col_stride1]\n"
+        "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
+        "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
+        "ldr qU32B, [uptr2, u_col_stride1]\n"
+        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+        "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
+        "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
+        "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
+        "ldr qU42B, [uptr3, u_col_stride1]\n"
+        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+        "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
+        "str qV23B, [vptr1, v_col_stride2]\n"
+        "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
+        "ldr qW23A, [wptr1, w_col_stride2]\n"
+        "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
+        "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
+        "ldr qU52B, [uptr4, u_col_stride1]\n"
+        "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
+        "ldr qU11B, [%x[uptr0]], #0x10\n"
+        "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
+        "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
+        "ldr qU21B, [uptr1], #0x10\n"
+        "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
+        "ldr qW33A, [wptr2, w_col_stride2]\n"
+        "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
+        "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
+        "str qV33B, [vptr2, v_col_stride2]\n"
+        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+        "ldr qU31B, [uptr2], #0x10\n"
+        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+        "ldr qU41B, [uptr3], #0x10\n"
+        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+        "ldr qU51B, [uptr4], #0x10\n"
+        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+        "ldr qW12A, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+        "ldr qU15A, [%x[uptr0], u_col_stride4]\n"
+        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+        "ldr qW22A, [wptr1, w_col_stride1]\n"
+        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+        "ldr qU14A, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+        "str qV12B, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+        "ldr qW32A, [wptr2, w_col_stride1]\n"
+        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+        "ldr qU13A, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
+        "ldr qU25A, [uptr1, u_col_stride4]\n"
+        "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
+        "ldr qU24A, [uptr1, u_col_stride3]\n"
+        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+        "str qV22B, [vptr1, v_col_stride1]\n"
+        "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
+        "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
+        "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
+        "subs %x[n_iters], %x[n_iters], #1\n"
+        "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
+        "str qV32B, [vptr2, v_col_stride1]\n"
+        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+        "ldr qW11A, [%x[wptr0]], #0x10\n"
+        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+        "ldr qU23A, [uptr1, u_col_stride2]\n"
+        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+        "ldr qW21A, [wptr1], #0x10\n"
+        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+        "str qV11B, [%x[vptr0]], #0x10\n"
+        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+        "ldr qW31A, [wptr2], #0x10\n"
+        "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
+        "ldr qU34A, [uptr2, u_col_stride3]\n"
+        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+        "str qV21B, [vptr1], #0x10\n"
+        "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
+        "ldr qU35A, [uptr2, u_col_stride4]\n"
+        "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
+        "str qV31B, [vptr2], #0x10\n"
+
+        // First part of A
+        "fmul vV13A.4s, vU15A.4s, vW13A.4s\n"
+        "ldr qU33A, [uptr2, u_col_stride2]\n"
+        "fmul vV12A.4s, vU14A.4s, vW13A.4s\n"
+        "bne 1b\n"  // Loop
+
+        "2:"  // Tail dispatch
+        "cbnz %w[odd_tail], 3f\n"
+
+        // Even tail
+        // A Part
+        "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
+        "ldr qU45A, [uptr3, u_col_stride4]\n"
+        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+        "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
+        "ldr qU44A, [uptr3, u_col_stride3]\n"
+        "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
+        "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
+        "ldr qU43A, [uptr3, u_col_stride2]\n"
+        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+        "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
+        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+        "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
+        "ldr qU55A, [uptr4, u_col_stride4]\n"
+        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+        "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
+        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+        "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
+        "ldr qU54A, [uptr4, u_col_stride3]\n"
+        "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
+        "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
+        "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
+        "ldr qU53A, [uptr4, u_col_stride2]\n"
+        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+        "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+        "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
+        "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
+        "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
+        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+        "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
+        "str qV13A, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+        "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
+        "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
+        "ldr qW13B, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
+        "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
+        "ldr qU22A, [uptr1, u_col_stride1]\n"
+        "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
+        "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
+        "ldr qU32A, [uptr2, u_col_stride1]\n"
+        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+        "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
+        "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
+        "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
+        "ldr qU42A, [uptr3, u_col_stride1]\n"
+        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+        "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
+        "str qV23A, [vptr1, v_col_stride2]\n"
+        "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
+        "ldr qW23B, [wptr1, w_col_stride2]\n"
+        "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
+        "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
+        "ldr qU52A, [uptr4, u_col_stride1]\n"
+        "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
+        "ldr qU11A, [%x[uptr0]], #0x10\n"
+        "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
+        "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
+        "ldr qU21A, [uptr1], #0x10\n"
+        "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
+        "ldr qW33B, [wptr2, w_col_stride2]\n"
+        "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
+        "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
+        "str qV33A, [vptr2, v_col_stride2]\n"
+        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+        "ldr qU31A, [uptr2], #0x10\n"
+        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+        "ldr qU41A, [uptr3], #0x10\n"
+        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+        "ldr qU51A, [uptr4], #0x10\n"
+        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+        "ldr qW12B, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+        "ldr qU15B, [%x[uptr0], u_col_stride4]\n"
+        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+        "ldr qW22B, [wptr1, w_col_stride1]\n"
+        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+        "ldr qU14B, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+        "str qV12A, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+        "ldr qW32B, [wptr2, w_col_stride1]\n"
+        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+        "ldr qU13B, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
+        "ldr qU25B, [uptr1, u_col_stride4]\n"
+        "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
+        "ldr qU24B, [uptr1, u_col_stride3]\n"
+        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+        "str qV22A, [vptr1, v_col_stride1]\n"
+        "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
+        "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
+        "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
+        "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
+        "str qV32A, [vptr2, v_col_stride1]\n"
+        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+        "ldr qW11B, [%x[wptr0]], #0x10\n"
+        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+        "ldr qU23B, [uptr1, u_col_stride2]\n"
+        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+        "ldr qW21B, [wptr1], #0x10\n"
+        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+        "str qV11A, [%x[vptr0]], #0x10\n"
+        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+        "ldr qW31B, [wptr2], #0x10\n"
+        "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
+        "ldr qU34B, [uptr2, u_col_stride3]\n"
+        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+        "str qV21A, [vptr1], #0x10\n"
+        "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
+        "ldr qU35B, [uptr2, u_col_stride4]\n"
+        "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
+        "str qV31A, [vptr2], #0x10\n"
+
+        // B Part
+        "fmul vV13B.4s, vU15B.4s, vW13B.4s\n"
+        "ldr qU33B, [uptr2, u_col_stride2]\n"
+        "fmul vV12B.4s, vU14B.4s, vW13B.4s\n"
+        "fmla vV13B.4s, vU14B.4s, vW12B.4s\n"
+        "ldr qU45B, [uptr3, u_col_stride4]\n"
+        "fmul vV11B.4s, vU13B.4s, vW13B.4s\n"
+        "fmla vV12B.4s, vU13B.4s, vW12B.4s\n"
+        "fmla vV13B.4s, vU13B.4s, vW11B.4s\n"
+        "ldr qU44B, [uptr3, u_col_stride3]\n"
+        "fmla vV13B.4s, vU25B.4s, vW23B.4s\n"
+        "fmul vV23B.4s, vU25B.4s, vW13B.4s\n"
+        "ldr qU43B, [uptr3, u_col_stride2]\n"
+        "fmla vV12B.4s, vU24B.4s, vW23B.4s\n"
+        "fmla vV13B.4s, vU24B.4s, vW22B.4s\n"
+        "fmul vV22B.4s, vU24B.4s, vW13B.4s\n"
+        "fmla vV23B.4s, vU24B.4s, vW12B.4s\n"
+        "ldr qU55B, [uptr4, u_col_stride4]\n"
+        "fmla vV11B.4s, vU23B.4s, vW23B.4s\n"
+        "fmla vV12B.4s, vU23B.4s, vW22B.4s\n"
+        "fmla vV13B.4s, vU23B.4s, vW21B.4s\n"
+        "fmul vV21B.4s, vU23B.4s, vW13B.4s\n"
+        "fmla vV22B.4s, vU23B.4s, vW12B.4s\n"
+        "fmla vV23B.4s, vU23B.4s, vW11B.4s\n"
+        "ldr qU54B, [uptr4, u_col_stride3]\n"
+        "fmla vV13B.4s, vU35B.4s, vW33B.4s\n"
+        "fmla vV23B.4s, vU35B.4s, vW23B.4s\n"
+        "fmul vV33B.4s, vU35B.4s, vW13B.4s\n"
+        "ldr qU53B, [uptr4, u_col_stride2]\n"
+        "fmla vV12B.4s, vU34B.4s, vW33B.4s\n"
+        "fmla vV13B.4s, vU34B.4s, vW32B.4s\n"
+        "fmla vV22B.4s, vU34B.4s, vW23B.4s\n"
+        "fmla vV23B.4s, vU34B.4s, vW22B.4s\n"
+        "fmul vV32B.4s, vU34B.4s, vW13B.4s\n"
+        "fmla vV33B.4s, vU34B.4s, vW12B.4s\n"
+        "ldr qU12B, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV11B.4s, vU33B.4s, vW33B.4s\n"
+        "fmla vV12B.4s, vU33B.4s, vW32B.4s\n"
+        "fmla vV13B.4s, vU33B.4s, vW31B.4s\n"
+        "str qV13B, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21B.4s, vU33B.4s, vW23B.4s\n"
+        "fmla vV22B.4s, vU33B.4s, vW22B.4s\n"
+        "fmla vV23B.4s, vU33B.4s, vW21B.4s\n"
+        "fmul vV31B.4s, vU33B.4s, vW13B.4s\n"
+        "fmla vV32B.4s, vU33B.4s, vW12B.4s\n"
+        "fmla vV33B.4s, vU33B.4s, vW11B.4s\n"
+        "ldr qU22B, [uptr1, u_col_stride1]\n"
+        "fmla vV23B.4s, vU45B.4s, vW33B.4s\n"
+        "fmla vV33B.4s, vU45B.4s, vW23B.4s\n"
+        "ldr qU32B, [uptr2, u_col_stride1]\n"
+        "fmla vV22B.4s, vU44B.4s, vW33B.4s\n"
+        "fmla vV23B.4s, vU44B.4s, vW32B.4s\n"
+        "fmla vV32B.4s, vU44B.4s, vW23B.4s\n"
+        "fmla vV33B.4s, vU44B.4s, vW22B.4s\n"
+        "ldr qU42B, [uptr3, u_col_stride1]\n"
+        "fmla vV21B.4s, vU43B.4s, vW33B.4s\n"
+        "fmla vV22B.4s, vU43B.4s, vW32B.4s\n"
+        "fmla vV23B.4s, vU43B.4s, vW31B.4s\n"
+        "str qV23B, [vptr1, v_col_stride2]\n"
+        "fmla vV31B.4s, vU43B.4s, vW23B.4s\n"
+        "fmla vV32B.4s, vU43B.4s, vW22B.4s\n"
+        "fmla vV33B.4s, vU43B.4s, vW21B.4s\n"
+        "ldr qU52B, [uptr4, u_col_stride1]\n"
+        "fmla vV33B.4s, vU55B.4s, vW33B.4s\n"
+        "ldr qU11B, [%x[uptr0]], #0x10\n"
+        "fmla vV32B.4s, vU54B.4s, vW33B.4s\n"
+        "fmla vV33B.4s, vU54B.4s, vW32B.4s\n"
+        "ldr qU21B, [uptr1], #0x10\n"
+        "fmla vV31B.4s, vU53B.4s, vW33B.4s\n"
+        "fmla vV32B.4s, vU53B.4s, vW32B.4s\n"
+        "fmla vV33B.4s, vU53B.4s, vW31B.4s\n"
+        "str qV33B, [vptr2, v_col_stride2]\n"
+        "fmla vV11B.4s, vU12B.4s, vW12B.4s\n"
+        "ldr qU31B, [uptr2], #0x10\n"
+        "fmla vV12B.4s, vU12B.4s, vW11B.4s\n"
+        "ldr qU41B, [uptr3], #0x10\n"
+        "fmla vV11B.4s, vU22B.4s, vW22B.4s\n"
+        "ldr qU51B, [uptr4], #0x10\n"
+        "fmla vV12B.4s, vU22B.4s, vW21B.4s\n"
+        "fmla vV21B.4s, vU22B.4s, vW12B.4s\n"
+        "fmla vV22B.4s, vU22B.4s, vW11B.4s\n"
+        "fmla vV11B.4s, vU32B.4s, vW32B.4s\n"
+        "fmla vV12B.4s, vU32B.4s, vW31B.4s\n"
+        "str qV12B, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV21B.4s, vU32B.4s, vW22B.4s\n"
+        "fmla vV22B.4s, vU32B.4s, vW21B.4s\n"
+        "fmla vV31B.4s, vU32B.4s, vW12B.4s\n"
+        "fmla vV32B.4s, vU32B.4s, vW11B.4s\n"
+        "fmla vV21B.4s, vU42B.4s, vW32B.4s\n"
+        "fmla vV22B.4s, vU42B.4s, vW31B.4s\n"
+        "str qV22B, [vptr1, v_col_stride1]\n"
+        "fmla vV31B.4s, vU42B.4s, vW22B.4s\n"
+        "fmla vV32B.4s, vU42B.4s, vW21B.4s\n"
+        "fmla vV31B.4s, vU52B.4s, vW32B.4s\n"
+        "subs %x[n_iters], %x[n_iters], #1\n"
+        "fmla vV32B.4s, vU52B.4s, vW31B.4s\n"
+        "str qV32B, [vptr2, v_col_stride1]\n"
+        "fmla vV11B.4s, vU11B.4s, vW11B.4s\n"
+        "fmla vV11B.4s, vU21B.4s, vW21B.4s\n"
+        "fmla vV21B.4s, vU21B.4s, vW11B.4s\n"
+        "fmla vV11B.4s, vU31B.4s, vW31B.4s\n"
+        "str qV11B, [%x[vptr0]], #0x10\n"
+        "fmla vV21B.4s, vU31B.4s, vW21B.4s\n"
+        "fmla vV31B.4s, vU31B.4s, vW11B.4s\n"
+        "fmla vV21B.4s, vU41B.4s, vW31B.4s\n"
+        "str qV21B, [vptr1], #0x10\n"
+        "fmla vV31B.4s, vU41B.4s, vW21B.4s\n"
+        "fmla vV31B.4s, vU51B.4s, vW31B.4s\n"
+        "str qV31B, [vptr2], #0x10\n"
+
+        "b 4f\n"  // Branch to end of method
+
+        "3:"  // Odd tail, finish off A
+        "fmla vV13A.4s, vU14A.4s, vW12A.4s\n"
+        "ldr qU45A, [uptr3, u_col_stride4]\n"
+        "fmul vV11A.4s, vU13A.4s, vW13A.4s\n"
+        "fmla vV12A.4s, vU13A.4s, vW12A.4s\n"
+        "fmla vV13A.4s, vU13A.4s, vW11A.4s\n"
+        "ldr qU44A, [uptr3, u_col_stride3]\n"
+        "fmla vV13A.4s, vU25A.4s, vW23A.4s\n"
+        "fmul vV23A.4s, vU25A.4s, vW13A.4s\n"
+        "ldr qU43A, [uptr3, u_col_stride2]\n"
+        "fmla vV12A.4s, vU24A.4s, vW23A.4s\n"
+        "fmla vV13A.4s, vU24A.4s, vW22A.4s\n"
+        "fmul vV22A.4s, vU24A.4s, vW13A.4s\n"
+        "fmla vV23A.4s, vU24A.4s, vW12A.4s\n"
+        "ldr qU55A, [uptr4, u_col_stride4]\n"
+        "fmla vV11A.4s, vU23A.4s, vW23A.4s\n"
+        "fmla vV12A.4s, vU23A.4s, vW22A.4s\n"
+        "fmla vV13A.4s, vU23A.4s, vW21A.4s\n"
+        "fmul vV21A.4s, vU23A.4s, vW13A.4s\n"
+        "fmla vV22A.4s, vU23A.4s, vW12A.4s\n"
+        "fmla vV23A.4s, vU23A.4s, vW11A.4s\n"
+        "ldr qU54A, [uptr4, u_col_stride3]\n"
+        "fmla vV13A.4s, vU35A.4s, vW33A.4s\n"
+        "fmla vV23A.4s, vU35A.4s, vW23A.4s\n"
+        "fmul vV33A.4s, vU35A.4s, vW13A.4s\n"
+        "ldr qU53A, [uptr4, u_col_stride2]\n"
+        "fmla vV12A.4s, vU34A.4s, vW33A.4s\n"
+        "fmla vV13A.4s, vU34A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU34A.4s, vW23A.4s\n"
+        "fmla vV23A.4s, vU34A.4s, vW22A.4s\n"
+        "fmul vV32A.4s, vU34A.4s, vW13A.4s\n"
+        "fmla vV33A.4s, vU34A.4s, vW12A.4s\n"
+        "ldr qU12A, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV11A.4s, vU33A.4s, vW33A.4s\n"
+        "fmla vV12A.4s, vU33A.4s, vW32A.4s\n"
+        "fmla vV13A.4s, vU33A.4s, vW31A.4s\n"
+        "str qV13A, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21A.4s, vU33A.4s, vW23A.4s\n"
+        "fmla vV22A.4s, vU33A.4s, vW22A.4s\n"
+        "fmla vV23A.4s, vU33A.4s, vW21A.4s\n"
+        "fmul vV31A.4s, vU33A.4s, vW13A.4s\n"
+        "fmla vV32A.4s, vU33A.4s, vW12A.4s\n"
+        "fmla vV33A.4s, vU33A.4s, vW11A.4s\n"
+        "ldr qU22A, [uptr1, u_col_stride1]\n"
+        "fmla vV23A.4s, vU45A.4s, vW33A.4s\n"
+        "fmla vV33A.4s, vU45A.4s, vW23A.4s\n"
+        "ldr qU32A, [uptr2, u_col_stride1]\n"
+        "fmla vV22A.4s, vU44A.4s, vW33A.4s\n"
+        "fmla vV23A.4s, vU44A.4s, vW32A.4s\n"
+        "fmla vV32A.4s, vU44A.4s, vW23A.4s\n"
+        "fmla vV33A.4s, vU44A.4s, vW22A.4s\n"
+        "ldr qU42A, [uptr3, u_col_stride1]\n"
+        "fmla vV21A.4s, vU43A.4s, vW33A.4s\n"
+        "fmla vV22A.4s, vU43A.4s, vW32A.4s\n"
+        "fmla vV23A.4s, vU43A.4s, vW31A.4s\n"
+        "str qV23A, [vptr1, v_col_stride2]\n"
+        "fmla vV31A.4s, vU43A.4s, vW23A.4s\n"
+        "fmla vV32A.4s, vU43A.4s, vW22A.4s\n"
+        "fmla vV33A.4s, vU43A.4s, vW21A.4s\n"
+        "ldr qU52A, [uptr4, u_col_stride1]\n"
+        "fmla vV33A.4s, vU55A.4s, vW33A.4s\n"
+        "ldr qU11A, [%x[uptr0]], #0x10\n"
+        "fmla vV32A.4s, vU54A.4s, vW33A.4s\n"
+        "fmla vV33A.4s, vU54A.4s, vW32A.4s\n"
+        "ldr qU21A, [uptr1], #0x10\n"
+        "fmla vV31A.4s, vU53A.4s, vW33A.4s\n"
+        "fmla vV32A.4s, vU53A.4s, vW32A.4s\n"
+        "fmla vV33A.4s, vU53A.4s, vW31A.4s\n"
+        "str qV33A, [vptr2, v_col_stride2]\n"
+        "fmla vV11A.4s, vU12A.4s, vW12A.4s\n"
+        "ldr qU31A, [uptr2], #0x10\n"
+        "fmla vV12A.4s, vU12A.4s, vW11A.4s\n"
+        "ldr qU41A, [uptr3], #0x10\n"
+        "fmla vV11A.4s, vU22A.4s, vW22A.4s\n"
+        "ldr qU51A, [uptr4], #0x10\n"
+        "fmla vV12A.4s, vU22A.4s, vW21A.4s\n"
+        "fmla vV21A.4s, vU22A.4s, vW12A.4s\n"
+        "fmla vV22A.4s, vU22A.4s, vW11A.4s\n"
+        "fmla vV11A.4s, vU32A.4s, vW32A.4s\n"
+        "fmla vV12A.4s, vU32A.4s, vW31A.4s\n"
+        "str qV12A, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV21A.4s, vU32A.4s, vW22A.4s\n"
+        "fmla vV22A.4s, vU32A.4s, vW21A.4s\n"
+        "fmla vV31A.4s, vU32A.4s, vW12A.4s\n"
+        "fmla vV32A.4s, vU32A.4s, vW11A.4s\n"
+        "fmla vV21A.4s, vU42A.4s, vW32A.4s\n"
+        "fmla vV22A.4s, vU42A.4s, vW31A.4s\n"
+        "str qV22A, [vptr1, v_col_stride1]\n"
+        "fmla vV31A.4s, vU42A.4s, vW22A.4s\n"
+        "fmla vV32A.4s, vU42A.4s, vW21A.4s\n"
+        "fmla vV31A.4s, vU52A.4s, vW32A.4s\n"
+        "fmla vV32A.4s, vU52A.4s, vW31A.4s\n"
+        "str qV32A, [vptr2, v_col_stride1]\n"
+        "fmla vV11A.4s, vU11A.4s, vW11A.4s\n"
+        "fmla vV11A.4s, vU21A.4s, vW21A.4s\n"
+        "fmla vV21A.4s, vU21A.4s, vW11A.4s\n"
+        "fmla vV11A.4s, vU31A.4s, vW31A.4s\n"
+        "str qV11A, [%x[vptr0]], #0x10\n"
+        "fmla vV21A.4s, vU31A.4s, vW21A.4s\n"
+        "fmla vV31A.4s, vU31A.4s, vW11A.4s\n"
+        "fmla vV21A.4s, vU41A.4s, vW31A.4s\n"
+        "str qV21A, [vptr1], #0x10\n"
+        "fmla vV31A.4s, vU41A.4s, vW21A.4s\n"
+        "fmla vV31A.4s, vU51A.4s, vW31A.4s\n"
+        "str qV31A, [vptr2], #0x10\n"
+
+        "4:"  // End of method
+        ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
+        ".unreq u_col_stride1\n" ".unreq u_col_stride2\n"
+        ".unreq u_col_stride3\n" ".unreq u_col_stride4\n"
+        ".unreq wptr1\n" ".unreq wptr2\n"
+        ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
+        ".unreq vptr1\n" ".unreq vptr2\n"
+        ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
+
+        ".unreq qU22B\n" ".unreq qW13B\n" ".unreq qW13A\n" ".unreq qU51B\n"
+        ".unreq qU54B\n" ".unreq qU45A\n" ".unreq qU15A\n" ".unreq qU41B\n"
+        ".unreq qU24B\n" ".unreq qU21A\n"
+        ".unreq qV11B\n" ".unreq qU51A\n" ".unreq qU35A\n" ".unreq qU12A\n"
+        ".unreq qU42B\n" ".unreq qU44B\n" ".unreq qU13B\n" ".unreq qW33A\n"
+        ".unreq qV31B\n" ".unreq qV23A\n" ".unreq qU31A\n" ".unreq qU35B\n" ".unreq qU13A\n"
+        ".unreq qV23B\n" ".unreq qU11A\n" ".unreq qU25A\n" ".unreq qU43A\n" ".unreq qU52B\n"
+        ".unreq qU24A\n" ".unreq qU23B\n" ".unreq qV21A\n" ".unreq qV32B\n"
+        ".unreq qV33B\n" ".unreq qW11A\n" ".unreq qU31B\n"
+        ".unreq qW12B\n" ".unreq qU33A\n" ".unreq qU14A\n" ".unreq qU22A\n"
+        ".unreq qU25B\n" ".unreq qU53B\n" ".unreq qU42A\n" ".unreq qU44A\n"
+        ".unreq qU43B\n" ".unreq qW31A\n" ".unreq qU11B\n"
+        ".unreq qW11B\n" ".unreq qW32A\n"
+        ".unreq qU12B\n" ".unreq qU34B\n" ".unreq qW21A\n"
+        ".unreq qU14B\n" ".unreq qV21B\n" ".unreq qW22A\n"
+        ".unreq qW23B\n" ".unreq qW23A\n" ".unreq qU21B\n"
+        ".unreq qU32B\n" ".unreq qU34A\n" ".unreq qU45B\n" ".unreq qV31A\n"
+        ".unreq qW12A\n" ".unreq qU33B\n" ".unreq qU15B\n"
+        ".unreq qW33B\n" ".unreq qU54A\n" ".unreq qU23A\n"
+        ".unreq qW32B\n" ".unreq qV33A\n" ".unreq qW31B\n" ".unreq qV12A\n"
+        ".unreq qV12B\n" ".unreq qU41A\n" ".unreq qU53A\n"
+        ".unreq qV13A\n" ".unreq qU32A\n" ".unreq qW22B\n"
+        ".unreq qV22B\n" ".unreq qU52A\n" ".unreq qV13B\n" ".unreq qV32A\n"
+        ".unreq qU55A\n" ".unreq qU55B\n" ".unreq qV22A\n" ".unreq qW21B\n"
+        ".unreq qV11A\n"
+        ".unreq vU22B\n" ".unreq vW13B\n" ".unreq vW13A\n" ".unreq vU51B\n"
+        ".unreq vU54B\n" ".unreq vU45A\n" ".unreq vU15A\n" ".unreq vU41B\n"
+        ".unreq vU24B\n" ".unreq vU21A\n"
+        ".unreq vV11B\n" ".unreq vU51A\n" ".unreq vU35A\n" ".unreq vU12A\n"
+        ".unreq vU42B\n" ".unreq vU44B\n" ".unreq vU13B\n" ".unreq vW33A\n"
+        ".unreq vV31B\n" ".unreq vV23A\n" ".unreq vU31A\n" ".unreq vU35B\n" ".unreq vU13A\n"
+        ".unreq vV23B\n" ".unreq vU11A\n" ".unreq vU25A\n" ".unreq vU43A\n" ".unreq vU52B\n"
+        ".unreq vU24A\n" ".unreq vU23B\n" ".unreq vV21A\n" ".unreq vV32B\n"
+        ".unreq vV33B\n" ".unreq vW11A\n" ".unreq vU31B\n"
+        ".unreq vW12B\n" ".unreq vU33A\n" ".unreq vU14A\n" ".unreq vU22A\n"
+        ".unreq vU25B\n" ".unreq vU53B\n" ".unreq vU42A\n" ".unreq vU44A\n"
+        ".unreq vU43B\n" ".unreq vW31A\n" ".unreq vU11B\n"
+        ".unreq vW11B\n" ".unreq vW32A\n"
+        ".unreq vU12B\n" ".unreq vU34B\n" ".unreq vW21A\n"
+        ".unreq vU14B\n" ".unreq vV21B\n" ".unreq vW22A\n"
+        ".unreq vW23B\n" ".unreq vW23A\n" ".unreq vU21B\n"
+        ".unreq vU32B\n" ".unreq vU34A\n" ".unreq vU45B\n" ".unreq vV31A\n"
+        ".unreq vW12A\n" ".unreq vU33B\n" ".unreq vU15B\n"
+        ".unreq vW33B\n" ".unreq vU54A\n" ".unreq vU23A\n"
+        ".unreq vW32B\n" ".unreq vV33A\n" ".unreq vW31B\n" ".unreq vV12A\n"
+        ".unreq vV12B\n" ".unreq vU41A\n" ".unreq vU53A\n"
+        ".unreq vV13A\n" ".unreq vU32A\n" ".unreq vW22B\n"
+        ".unreq vV22B\n" ".unreq vU52A\n" ".unreq vV13B\n" ".unreq vV32A\n"
+        ".unreq vU55A\n" ".unreq vU55B\n" ".unreq vV22A\n" ".unreq vW21B\n"
+        ".unreq vV11A\n"
+        : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
+          [n_iters] "+r" (n_iters)
+        : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+          [u_col_stride] "r" (in_col_stride * sizeof(float)),
+          [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+          [w_col_stride] "r" (weight_col_stride * sizeof(float)),
+          [v_row_stride] "r" (out_row_stride * sizeof(float)),
+          [v_col_stride] "r" (out_col_stride * sizeof(float)),
+          [odd_tail] "r" (odd_tail)
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+          "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+          "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
+          "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+          "x12", "cc", "memory"
+    );
+  }
+  if (channels_remaining)
+  {
+    // Fall back on the unoptimised version to clean up the tail
+    ConvImpl::process_tile<false>(
+        channels_remaining,
+        wptr0, weight_row_stride, weight_col_stride,
+        uptr0, in_row_stride, in_col_stride,
+        vptr0, out_row_stride, out_col_stride,
+        0, 0, 0, 0, 0, 0
+    );
+  }
+}
+
+#endif  // __aarch64__
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
 };
 
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
 
 template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 8d511b1..2510941 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -28,3416 +28,596 @@
 using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
 using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>;
 
+#ifdef __aarch64__
+
 template <>
-const Conv::TileFn Conv::tile_fns
-  [max_in_pad_top]
-  [max_in_pad_left]
-  [max_in_pad_bottom]
-  [max_in_pad_right]
-  [max_out_pad_bottom]
-  [max_out_pad_right] = {
-  {  // Input pad top = 0
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 1
-  },  // Input pad top = 0
-  {  // Input pad top = 1
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 0, 2>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 1, 2>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 2, 2>,
-          },  // Output pad bottom = 2
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 1
-  },  // Input pad top = 1
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+  const int n_channels,
+  const float* const weights,
+  const int weight_row_stride,
+  const int weight_col_stride,
+  const float* const inptr,
+  const int in_row_stride,
+  const int in_col_stride,
+  float* const outptr,
+  const int out_row_stride,
+  const int out_col_stride,
+  const int, const int, const int, const int, const int, const int
+)
+{
+  // Copy pointers
+  const float *uptr0 = inptr;
+  const float *wptr0 = weights;
+  float *vptr0 = outptr;
+
+  int channels_remaining = n_channels;
+  if (channels_remaining >= 4)
+  {
+    // Process blocks of 4 channels at a time
+    int n_iters = channels_remaining / 4 - 1;
+    channels_remaining %= 4;
+
+    asm volatile(
+        // Prepare aliases
+        "qW13 .req q0\n" "vW13 .req v0\n"
+        "qU15 .req q1\n" "qU73 .req q1\n" "qU45 .req q1\n" "qU14 .req q1\n"
+        "vU15 .req v1\n" "vU73 .req v1\n" "vU45 .req v1\n" "vU14 .req v1\n"
+        "qU62 .req q2\n" "qV12 .req q2\n" "vU62 .req v2\n" "vV12 .req v2\n"
+        "qU51 .req q3\n" "qU43 .req q3\n" "qU55 .req q3\n"
+        "vU51 .req v3\n" "vU43 .req v3\n" "vU55 .req v3\n"
+        "qU77 .req q4\n" "qV13 .req q4\n" "qV31 .req q4\n" "qU44 .req q4\n"
+        "vU77 .req v4\n" "vV13 .req v4\n" "vV31 .req v4\n" "vU44 .req v4\n"
+        "qV33 .req q5\n" "qU46 .req q5\n" "qU11 .req q5\n" "qU37 .req q5\n"
+        "vV33 .req v5\n" "vU46 .req v5\n" "vU11 .req v5\n" "vU37 .req v5\n"
+        "qU56 .req q6\n" "qU25 .req q6\n" "qU32 .req q6\n"
+        "vU56 .req v6\n" "vU25 .req v6\n" "vU32 .req v6\n"
+        "qU72 .req q7\n" "qV22 .req q7\n" "vU72 .req v7\n" "vV22 .req v7\n"
+        "qU67 .req q8\n" "qU61 .req q8\n" "qU13 .req q8\n"
+        "vU67 .req v8\n" "vU61 .req v8\n" "vU13 .req v8\n"
+        "qU74 .req q9\n" "qU34 .req q9\n" "qU17 .req q9\n" "qU66 .req q9\n"
+        "vU74 .req v9\n" "vU34 .req v9\n" "vU17 .req v9\n" "vU66 .req v9\n"
+        "qU33 .req q10\n" "qU57 .req q10\n" "qU21 .req q10\n"
+        "vU33 .req v10\n" "vU57 .req v10\n" "vU21 .req v10\n" "qW23 .req q11\n"
+        "vW23 .req v11\n" "qU42 .req q12\n" "qV23 .req q12\n" "qU23 .req q12\n"
+        "vU42 .req v12\n" "vV23 .req v12\n" "vU23 .req v12\n"
+        "qW33 .req q13\n" "vW33 .req v13\n"
+        "qU76 .req q14\n" "qU47 .req q14\n" "qU64 .req q14\n" "qU41 .req q14\n"
+        "vU76 .req v14\n" "vU47 .req v14\n" "vU64 .req v14\n" "vU41 .req v14\n"
+        "qU52 .req q15\n" "qU54 .req q15\n" "qU75 .req q15\n" "qU26 .req q15\n"
+        "vU52 .req v15\n" "vU54 .req v15\n" "vU75 .req v15\n" "vU26 .req v15\n"
+        "qU53 .req q16\n" "qU27 .req q16\n" "vU53 .req v16\n" "vU27 .req v16\n"
+        "qV21 .req q17\n" "qU65 .req q17\n" "vV21 .req v17\n" "vU65 .req v17\n"
+        "qU31 .req q18\n" "qU24 .req q18\n" "qU36 .req q18\n"
+        "vU31 .req v18\n" "vU24 .req v18\n" "vU36 .req v18\n" "qU22 .req q19\n"
+        "vU22 .req v19\n" "qU35 .req q20\n" "qU63 .req q20\n"
+        "vU35 .req v20\n" "vU63 .req v20\n" "qW12 .req q21\n"
+        "vW12 .req v21\n" "qV32 .req q22\n" "qU16 .req q22\n"
+        "vV32 .req v22\n" "vU16 .req v22\n" "qW11 .req q23\n" "vW11 .req v23\n"
+        "qU12 .req q24\n" "vU12 .req v24\n" "qW31 .req q25\n" "vW31 .req v25\n"
+        "qW22 .req q26\n" "vW22 .req v26\n" "qU71 .req q27\n" "vU71 .req v27\n"
+        "qV11 .req q28\n" "vV11 .req v28\n" "qW21 .req q29\n" "vW21 .req v29\n"
+        "qW32 .req q30\n" "vW32 .req v30\n"
+
+        "uptr1 .req x0\n"
+        "uptr2 .req x1\n"
+        "uptr3 .req x2\n"
+        "uptr4 .req x3\n"
+        "uptr5 .req x4\n"
+        "uptr6 .req x5\n"
+        "u_col_stride1 .req %x[u_col_stride]\n"
+        "u_col_stride2 .req  x6\n"
+        "u_col_stride3 .req  x7\n"
+        "u_col_stride4 .req  x8\n"
+        "u_col_stride5 .req  x9\n"
+        "u_col_stride6 .req x10\n"
+        "wptr1 .req x11\n"
+        "wptr2 .req x12\n"
+        "w_col_stride1 .req %x[w_col_stride]\n"
+        "w_col_stride2 .req x13\n"
+        "vptr1 .req x14\n"
+        "vptr2 .req x15\n"
+        "v_col_stride1 .req %x[v_col_stride]\n"
+        "v_col_stride2 .req x16\n"
+
+        // Prepare strides and pointers
+        "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+        "add uptr2,    uptr1 , %x[u_row_stride]\n"
+        "add uptr3,    uptr2 , %x[u_row_stride]\n"
+        "add uptr4,    uptr3 , %x[u_row_stride]\n"
+        "add uptr5,    uptr4 , %x[u_row_stride]\n"
+        "add uptr6,    uptr5 , %x[u_row_stride]\n"
+        "add u_col_stride2, u_col_stride1, u_col_stride1\n"
+        "add u_col_stride3, u_col_stride2, u_col_stride1\n"
+        "add u_col_stride4, u_col_stride3, u_col_stride1\n"
+        "add u_col_stride5, u_col_stride4, u_col_stride1\n"
+        "add u_col_stride6, u_col_stride5, u_col_stride1\n"
+
+        "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+        "add wptr2,    wptr1 , %x[w_row_stride]\n"
+        "add w_col_stride2, w_col_stride1, w_col_stride1\n"
+
+        "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+        "add vptr2,    vptr1 , %x[v_row_stride]\n"
+        "add v_col_stride2, v_col_stride1, v_col_stride1\n"
+
+        // Prepare for first iteration
+        "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+        "ldr qW23, [wptr1, w_col_stride2]\n"
+        "ldr qW33, [wptr2, w_col_stride2]\n"
+        "ldr qW12, [%x[wptr0], w_col_stride1]\n"
+        "ldr qW22, [wptr1, w_col_stride1]\n"
+        "ldr qW32, [wptr2, w_col_stride1]\n"
+        "ldr qW11, [%x[wptr0]], #0x10\n"
+        "ldr qW21, [wptr1], #0x10\n"
+        "ldr qU17, [%x[uptr0], u_col_stride6]\n"
+        "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+        "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+        "ldr qU37, [uptr2, u_col_stride6]\n"
+        "ldr qU35, [uptr2, u_col_stride4]\n"
+        "ldr qU36, [uptr2, u_col_stride5]\n"
+        "ldr qU27, [uptr1, u_col_stride6]\n"
+        "ldr qU25, [uptr1, u_col_stride4]\n"
+        "fmul vV13.4s, vU17.4s, vW13.4s\n"
+        "fmul vV12.4s, vU15.4s, vW13.4s\n"
+        "fmla vV13.4s, vU15.4s, vW11.4s\n"
+        "ldr qW31, [wptr2], #0x10\n"
+        "fmla vV13.4s, vU16.4s, vW12.4s\n"
+        "ldr qU26, [uptr1, u_col_stride5]\n"
+        "fmla vV13.4s, vU37.4s, vW33.4s\n"
+        "ldr qU47, [uptr3, u_col_stride6]\n"
+        "fmul vV23.4s, vU37.4s, vW13.4s\n"
+        "ldr qU45, [uptr3, u_col_stride4]\n"
+        "fmla vV12.4s, vU35.4s, vW33.4s\n"
+        "ldr qU46, [uptr3, u_col_stride5]\n"
+        "fmla vV13.4s, vU35.4s, vW31.4s\n"
+        "ldr qU67, [uptr5, u_col_stride6]\n"
+        "fmul vV22.4s, vU35.4s, vW13.4s\n"
+        "cbz %x[n_iters], 2f\n"  // Jump to tail if no iterations
+
+        "1:"  // Loop body
+        "fmla vV23.4s, vU35.4s, vW11.4s\n"
+        "ldr qU65, [uptr5, u_col_stride4]\n"
+        "fmla vV13.4s, vU36.4s, vW32.4s\n"
+        "fmla vV23.4s, vU36.4s, vW12.4s\n"
+        "ldr qU66, [uptr5, u_col_stride5]\n"
+        "fmla vV13.4s, vU27.4s, vW23.4s\n"
+        "ldr qU57, [uptr4, u_col_stride6]\n"
+        "fmla vV12.4s, vU25.4s, vW23.4s\n"
+        "ldr qU55, [uptr4, u_col_stride4]\n"
+        "fmla vV13.4s, vU25.4s, vW21.4s\n"
+        "ldr qU56, [uptr4, u_col_stride5]\n"
+        "fmla vV13.4s, vU26.4s, vW22.4s\n"
+        "str qV13, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV23.4s, vU47.4s, vW23.4s\n"
+        "ldr qU77, [uptr6, u_col_stride6]\n"
+        "fmla vV22.4s, vU45.4s, vW23.4s\n"
+        "fmla vV23.4s, vU45.4s, vW21.4s\n"
+        "ldr qU75, [uptr6, u_col_stride4]\n"
+        "fmla vV23.4s, vU46.4s, vW22.4s\n"
+        "ldr qU76, [uptr6, u_col_stride5]\n"
+        "fmul vV33.4s, vU67.4s, vW23.4s\n"
+        "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+        "fmul vV32.4s, vU65.4s, vW23.4s\n"
+        "fmla vV33.4s, vU65.4s, vW21.4s\n"
+        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV33.4s, vU66.4s, vW22.4s\n"
+        "ldr qU34, [uptr2, u_col_stride3]\n"
+        "fmla vV23.4s, vU57.4s, vW33.4s\n"
+        "fmla vV33.4s, vU57.4s, vW13.4s\n"
+        "ldr qU33, [uptr2, u_col_stride2]\n"
+        "fmla vV22.4s, vU55.4s, vW33.4s\n"
+        "fmla vV23.4s, vU55.4s, vW31.4s\n"
+        "fmla vV32.4s, vU55.4s, vW13.4s\n"
+        "fmla vV33.4s, vU55.4s, vW11.4s\n"
+        "ldr qU24, [uptr1, u_col_stride3]\n"
+        "fmla vV23.4s, vU56.4s, vW32.4s\n"
+        "str qV23, [vptr1, v_col_stride2]\n"
+        "fmla vV33.4s, vU56.4s, vW12.4s\n"
+        "ldr qU23, [uptr1, u_col_stride2]\n"
+        "fmla vV33.4s, vU77.4s, vW33.4s\n"
+        "ldr qU44, [uptr3, u_col_stride3]\n"
+        "fmla vV32.4s, vU75.4s, vW33.4s\n"
+        "fmla vV33.4s, vU75.4s, vW31.4s\n"
+        "ldr qU43, [uptr3, u_col_stride2]\n"
+        "fmla vV33.4s, vU76.4s, vW32.4s\n"
+        "str qV33, [vptr2, v_col_stride2]\n"
+        "ldr qU64, [uptr5, u_col_stride3]\n"
+        "fmla vV12.4s, vU14.4s, vW12.4s\n"
+        "ldr qU63, [uptr5, u_col_stride2]\n"
+        "fmul vV11.4s, vU13.4s, vW13.4s\n"
+        "fmla vV12.4s, vU13.4s, vW11.4s\n"
+        "ldr qU54, [uptr4, u_col_stride3]\n"
+        "fmla vV12.4s, vU34.4s, vW32.4s\n"
+        "fmla vV22.4s, vU34.4s, vW12.4s\n"
+        "ldr qU53, [uptr4, u_col_stride2]\n"
+        "fmla vV11.4s, vU33.4s, vW33.4s\n"
+        "ldr qU74, [uptr6, u_col_stride3]\n"
+        "fmla vV12.4s, vU33.4s, vW31.4s\n"
+        "ldr qU73, [uptr6, u_col_stride2]\n"
+        "fmul vV21.4s, vU33.4s, vW13.4s\n"
+        "ldr qU12, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV22.4s, vU33.4s, vW11.4s\n"
+        "ldr qU11, [%x[uptr0]], #0x10\n"
+        "fmla vV12.4s, vU24.4s, vW22.4s\n"
+        "ldr qU32, [uptr2, u_col_stride1]\n"
+        "fmla vV11.4s, vU23.4s, vW23.4s\n"
+        "ldr qU31, [uptr2], #0x10\n"
+        "fmla vV12.4s, vU23.4s, vW21.4s\n"
+        "str qV12, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV22.4s, vU44.4s, vW22.4s\n"
+        "ldr qU22, [uptr1, u_col_stride1]\n"
+        "fmla vV21.4s, vU43.4s, vW23.4s\n"
+        "ldr qU21, [uptr1], #0x10\n"
+        "fmla vV22.4s, vU43.4s, vW21.4s\n"
+        "ldr qU42, [uptr3, u_col_stride1]\n"
+        "fmla vV32.4s, vU64.4s, vW22.4s\n"
+        "ldr qU41, [uptr3], #0x10\n"
+        "fmul vV31.4s, vU63.4s, vW23.4s\n"
+        "ldr qW23, [wptr1, w_col_stride2]\n"
+        "fmla vV32.4s, vU63.4s, vW21.4s\n"
+        "ldr qU62, [uptr5, u_col_stride1]\n"
+        "fmla vV22.4s, vU54.4s, vW32.4s\n"
+        "ldr qU61, [uptr5], #0x10\n"
+        "fmla vV32.4s, vU54.4s, vW12.4s\n"
+        "ldr qU52, [uptr4, u_col_stride1]\n"
+        "fmla vV21.4s, vU53.4s, vW33.4s\n"
+        "ldr qU51, [uptr4], #0x10\n"
+        "fmla vV22.4s, vU53.4s, vW31.4s\n"
+        "str qV22, [vptr1, v_col_stride1]\n"
+        "fmla vV31.4s, vU53.4s, vW13.4s\n"
+        "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV32.4s, vU53.4s, vW11.4s\n"
+        "ldr qU72, [uptr6, u_col_stride1]\n"
+        "fmla vV32.4s, vU74.4s, vW32.4s\n"
+        "ldr qU71, [uptr6], #0x10\n"
+        "fmla vV31.4s, vU73.4s, vW33.4s\n"
+        "ldr qW33, [wptr2, w_col_stride2]\n"
+        "fmla vV32.4s, vU73.4s, vW31.4s\n"
+        "str qV32, [vptr2, v_col_stride1]\n"
+        "fmla vV11.4s, vU12.4s, vW12.4s\n"
+        "ldr qU17, [%x[uptr0], u_col_stride6]\n"
+        "fmla vV11.4s, vU11.4s, vW11.4s\n"
+        "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+        "fmla vV11.4s, vU32.4s, vW32.4s\n"
+        "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+        "fmla vV21.4s, vU32.4s, vW12.4s\n"
+        "ldr qU37, [uptr2, u_col_stride6]\n"
+        "fmla vV11.4s, vU31.4s, vW31.4s\n"
+        "ldr qU35, [uptr2, u_col_stride4]\n"
+        "fmla vV21.4s, vU31.4s, vW11.4s\n"
+        "ldr qU36, [uptr2, u_col_stride5]\n"
+        "fmla vV11.4s, vU22.4s, vW22.4s\n"
+        "ldr qU27, [uptr1, u_col_stride6]\n"
+        "fmla vV11.4s, vU21.4s, vW21.4s\n"
+        "str qV11, [%x[vptr0]], #0x10\n"
+        "fmla vV21.4s, vU42.4s, vW22.4s\n"
+        "ldr qU25, [uptr1, u_col_stride4]\n"
+        "fmla vV21.4s, vU41.4s, vW21.4s\n"
+        "fmla vV31.4s, vU62.4s, vW22.4s\n"
+        "ldr qW22, [wptr1, w_col_stride1]\n"
+        "fmla vV31.4s, vU61.4s, vW21.4s\n"
+        "ldr qW21, [wptr1], #0x10\n"
+        "fmla vV21.4s, vU52.4s, vW32.4s\n"
+        "fmla vV31.4s, vU52.4s, vW12.4s\n"
+        "ldr qW12, [%x[wptr0], w_col_stride1]\n"
+        "fmla vV21.4s, vU51.4s, vW31.4s\n"
+        "str qV21, [vptr1], #0x10\n"
+        "fmla vV31.4s, vU51.4s, vW11.4s\n"
+        "ldr qW11, [%x[wptr0]], #0x10\n"
+        "fmla vV31.4s, vU72.4s, vW32.4s\n"
+        "ldr qW32, [wptr2, w_col_stride1]\n"
+        "fmla vV31.4s, vU71.4s, vW31.4s\n"
+        "str qV31, [vptr2], #0x10\n"
+        "fmul vV13.4s, vU17.4s, vW13.4s\n"
+        "fmul vV12.4s, vU15.4s, vW13.4s\n"
+        "subs %x[n_iters], %x[n_iters], #1\n"
+        "fmla vV13.4s, vU15.4s, vW11.4s\n"
+        "ldr qW31, [wptr2], #0x10\n"
+        "fmla vV13.4s, vU16.4s, vW12.4s\n"
+        "ldr qU26, [uptr1, u_col_stride5]\n"
+        "fmla vV13.4s, vU37.4s, vW33.4s\n"
+        "ldr qU47, [uptr3, u_col_stride6]\n"
+        "fmul vV23.4s, vU37.4s, vW13.4s\n"
+        "ldr qU45, [uptr3, u_col_stride4]\n"
+        "fmla vV12.4s, vU35.4s, vW33.4s\n"
+        "ldr qU46, [uptr3, u_col_stride5]\n"
+        "fmla vV13.4s, vU35.4s, vW31.4s\n"
+        "ldr qU67, [uptr5, u_col_stride6]\n"
+        "fmul vV22.4s, vU35.4s, vW13.4s\n"
+        "bne 1b\n"
+
+        "2:"  // Tail iteration
+        "fmla vV23.4s, vU35.4s, vW11.4s\n"
+        "ldr qU65, [uptr5, u_col_stride4]\n"
+        "fmla vV13.4s, vU36.4s, vW32.4s\n"
+        "fmla vV23.4s, vU36.4s, vW12.4s\n"
+        "ldr qU66, [uptr5, u_col_stride5]\n"
+        "fmla vV13.4s, vU27.4s, vW23.4s\n"
+        "ldr qU57, [uptr4, u_col_stride6]\n"
+        "fmla vV12.4s, vU25.4s, vW23.4s\n"
+        "ldr qU55, [uptr4, u_col_stride4]\n"
+        "fmla vV13.4s, vU25.4s, vW21.4s\n"
+        "ldr qU56, [uptr4, u_col_stride5]\n"
+        "fmla vV13.4s, vU26.4s, vW22.4s\n"
+        "str qV13, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV23.4s, vU47.4s, vW23.4s\n"
+        "ldr qU77, [uptr6, u_col_stride6]\n"
+        "fmla vV22.4s, vU45.4s, vW23.4s\n"
+        "fmla vV23.4s, vU45.4s, vW21.4s\n"
+        "ldr qU75, [uptr6, u_col_stride4]\n"
+        "fmla vV23.4s, vU46.4s, vW22.4s\n"
+        "ldr qU76, [uptr6, u_col_stride5]\n"
+        "fmul vV33.4s, vU67.4s, vW23.4s\n"
+        "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+        "fmul vV32.4s, vU65.4s, vW23.4s\n"
+        "fmla vV33.4s, vU65.4s, vW21.4s\n"
+        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV33.4s, vU66.4s, vW22.4s\n"
+        "ldr qU34, [uptr2, u_col_stride3]\n"
+        "fmla vV23.4s, vU57.4s, vW33.4s\n"
+        "fmla vV33.4s, vU57.4s, vW13.4s\n"
+        "ldr qU33, [uptr2, u_col_stride2]\n"
+        "fmla vV22.4s, vU55.4s, vW33.4s\n"
+        "fmla vV23.4s, vU55.4s, vW31.4s\n"
+        "fmla vV32.4s, vU55.4s, vW13.4s\n"
+        "fmla vV33.4s, vU55.4s, vW11.4s\n"
+        "ldr qU24, [uptr1, u_col_stride3]\n"
+        "fmla vV23.4s, vU56.4s, vW32.4s\n"
+        "str qV23, [vptr1, v_col_stride2]\n"
+        "fmla vV33.4s, vU56.4s, vW12.4s\n"
+        "ldr qU23, [uptr1, u_col_stride2]\n"
+        "fmla vV33.4s, vU77.4s, vW33.4s\n"
+        "ldr qU44, [uptr3, u_col_stride3]\n"
+        "fmla vV32.4s, vU75.4s, vW33.4s\n"
+        "fmla vV33.4s, vU75.4s, vW31.4s\n"
+        "ldr qU43, [uptr3, u_col_stride2]\n"
+        "fmla vV33.4s, vU76.4s, vW32.4s\n"
+        "str qV33, [vptr2, v_col_stride2]\n"
+        "ldr qU64, [uptr5, u_col_stride3]\n"
+        "fmla vV12.4s, vU14.4s, vW12.4s\n"
+        "ldr qU63, [uptr5, u_col_stride2]\n"
+        "fmul vV11.4s, vU13.4s, vW13.4s\n"
+        "fmla vV12.4s, vU13.4s, vW11.4s\n"
+        "ldr qU54, [uptr4, u_col_stride3]\n"
+        "fmla vV12.4s, vU34.4s, vW32.4s\n"
+        "fmla vV22.4s, vU34.4s, vW12.4s\n"
+        "ldr qU53, [uptr4, u_col_stride2]\n"
+        "fmla vV11.4s, vU33.4s, vW33.4s\n"
+        "ldr qU74, [uptr6, u_col_stride3]\n"
+        "fmla vV12.4s, vU33.4s, vW31.4s\n"
+        "ldr qU73, [uptr6, u_col_stride2]\n"
+        "fmul vV21.4s, vU33.4s, vW13.4s\n"
+        "ldr qU12, [%x[uptr0], u_col_stride1]\n"
+        "fmla vV22.4s, vU33.4s, vW11.4s\n"
+        "ldr qU11, [%x[uptr0]], #0x10\n"
+        "fmla vV12.4s, vU24.4s, vW22.4s\n"
+        "ldr qU32, [uptr2, u_col_stride1]\n"
+        "fmla vV11.4s, vU23.4s, vW23.4s\n"
+        "ldr qU31, [uptr2], #0x10\n"
+        "fmla vV12.4s, vU23.4s, vW21.4s\n"
+        "str qV12, [%x[vptr0], v_col_stride1]\n"
+        "fmla vV22.4s, vU44.4s, vW22.4s\n"
+        "ldr qU22, [uptr1, u_col_stride1]\n"
+        "fmla vV21.4s, vU43.4s, vW23.4s\n"
+        "ldr qU21, [uptr1], #0x10\n"
+        "fmla vV22.4s, vU43.4s, vW21.4s\n"
+        "ldr qU42, [uptr3, u_col_stride1]\n"
+        "fmla vV32.4s, vU64.4s, vW22.4s\n"
+        "ldr qU41, [uptr3], #0x10\n"
+        "fmul vV31.4s, vU63.4s, vW23.4s\n"
+        "fmla vV32.4s, vU63.4s, vW21.4s\n"
+        "ldr qU62, [uptr5, u_col_stride1]\n"
+        "fmla vV22.4s, vU54.4s, vW32.4s\n"
+        "ldr qU61, [uptr5], #0x10\n"
+        "fmla vV32.4s, vU54.4s, vW12.4s\n"
+        "ldr qU52, [uptr4, u_col_stride1]\n"
+        "fmla vV21.4s, vU53.4s, vW33.4s\n"
+        "ldr qU51, [uptr4], #0x10\n"
+        "fmla vV22.4s, vU53.4s, vW31.4s\n"
+        "str qV22, [vptr1, v_col_stride1]\n"
+        "fmla vV31.4s, vU53.4s, vW13.4s\n"
+        "fmla vV32.4s, vU53.4s, vW11.4s\n"
+        "ldr qU72, [uptr6, u_col_stride1]\n"
+        "fmla vV32.4s, vU74.4s, vW32.4s\n"
+        "ldr qU71, [uptr6], #0x10\n"
+        "fmla vV31.4s, vU73.4s, vW33.4s\n"
+        "fmla vV32.4s, vU73.4s, vW31.4s\n"
+        "str qV32, [vptr2, v_col_stride1]\n"
+        "fmla vV11.4s, vU12.4s, vW12.4s\n"
+        "fmla vV11.4s, vU11.4s, vW11.4s\n"
+        "fmla vV11.4s, vU32.4s, vW32.4s\n"
+        "fmla vV21.4s, vU32.4s, vW12.4s\n"
+        "fmla vV11.4s, vU31.4s, vW31.4s\n"
+        "fmla vV21.4s, vU31.4s, vW11.4s\n"
+        "fmla vV11.4s, vU22.4s, vW22.4s\n"
+        "fmla vV11.4s, vU21.4s, vW21.4s\n"
+        "str qV11, [%x[vptr0]], #0x10\n"
+        "fmla vV21.4s, vU42.4s, vW22.4s\n"
+        "fmla vV21.4s, vU41.4s, vW21.4s\n"
+        "fmla vV31.4s, vU62.4s, vW22.4s\n"
+        "fmla vV31.4s, vU61.4s, vW21.4s\n"
+        "fmla vV21.4s, vU52.4s, vW32.4s\n"
+        "fmla vV31.4s, vU52.4s, vW12.4s\n"
+        "fmla vV21.4s, vU51.4s, vW31.4s\n"
+        "str qV21, [vptr1], #0x10\n"
+        "fmla vV31.4s, vU51.4s, vW11.4s\n"
+        "fmla vV31.4s, vU72.4s, vW32.4s\n"
+        "fmla vV31.4s, vU71.4s, vW31.4s\n"
+        "str qV31, [vptr2], #0x10\n"
+
+        // Clear aliases
+        ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n"
+        ".unreq uptr5\n" ".unreq uptr6\n"
+        ".unreq u_col_stride1\n" ".unreq u_col_stride2\n" ".unreq u_col_stride3\n"
+        ".unreq u_col_stride4\n" ".unreq u_col_stride5\n" ".unreq u_col_stride6\n"
+        ".unreq wptr1\n" ".unreq wptr2\n"
+        ".unreq w_col_stride1\n" ".unreq w_col_stride2\n"
+        ".unreq vptr1\n" ".unreq vptr2\n"
+        ".unreq v_col_stride1\n" ".unreq v_col_stride2\n"
+        ".unreq qU15\n" ".unreq qU73\n" ".unreq qU45\n" ".unreq qU14\n"
+        ".unreq qW13\n" ".unreq qU62\n" ".unreq qV12\n"
+        ".unreq qU51\n" ".unreq qU43\n" ".unreq qU55\n"
+        ".unreq qU77\n" ".unreq qV13\n" ".unreq qV31\n" ".unreq qU44\n"
+        ".unreq qV33\n" ".unreq qU46\n" ".unreq qU11\n" ".unreq qU37\n"
+        ".unreq qU56\n" ".unreq qU25\n" ".unreq qU32\n"
+        ".unreq qU72\n" ".unreq qV22\n"
+        ".unreq qU67\n" ".unreq qU61\n" ".unreq qU13\n" ".unreq qW33\n"
+        ".unreq qU74\n" ".unreq qU34\n" ".unreq qU17\n" ".unreq qU66\n"
+        ".unreq qU33\n" ".unreq qU57\n" ".unreq qU21\n"
+        ".unreq qW23\n" ".unreq qU42\n" ".unreq qV23\n" ".unreq qU23\n"
+        ".unreq qU76\n" ".unreq qU47\n" ".unreq qU64\n" ".unreq qU41\n"
+        ".unreq qU52\n" ".unreq qU54\n" ".unreq qU75\n" ".unreq qU26\n"
+        ".unreq qU53\n" ".unreq qU27\n"
+        ".unreq qV21\n" ".unreq qU65\n"
+        ".unreq qU31\n" ".unreq qU24\n" ".unreq qU36\n" ".unreq qU22\n"
+        ".unreq qU35\n" ".unreq qU63\n" ".unreq qW12\n"
+        ".unreq qV32\n" ".unreq qU16\n" ".unreq qW11\n" ".unreq qU12\n"
+        ".unreq qW31\n" ".unreq qW22\n" ".unreq qU71\n" ".unreq qV11\n"
+        ".unreq qW21\n" ".unreq qW32\n" ".unreq vW13\n"
+        ".unreq vU15\n" ".unreq vU73\n" ".unreq vU45\n" ".unreq vU14\n"
+        ".unreq vU62\n" ".unreq vV12\n"
+        ".unreq vU51\n" ".unreq vU43\n" ".unreq vU55\n"
+        ".unreq vU77\n" ".unreq vV13\n" ".unreq vV31\n" ".unreq vU44\n"
+        ".unreq vV33\n" ".unreq vU46\n" ".unreq vU11\n" ".unreq vU37\n"
+        ".unreq vU56\n" ".unreq vU25\n" ".unreq vU32\n"
+        ".unreq vU72\n" ".unreq vV22\n" ".unreq vW21\n" ".unreq vW32\n"
+        ".unreq vU67\n" ".unreq vU61\n" ".unreq vU13\n"
+        ".unreq vU74\n" ".unreq vU34\n" ".unreq vU17\n" ".unreq vU66\n"
+        ".unreq vU33\n" ".unreq vU57\n" ".unreq vU21\n" ".unreq vW23\n"
+        ".unreq vU42\n" ".unreq vV23\n" ".unreq vU23\n" ".unreq vW33\n"
+        ".unreq vU76\n" ".unreq vU47\n" ".unreq vU64\n" ".unreq vU41\n"
+        ".unreq vU52\n" ".unreq vU54\n" ".unreq vU75\n" ".unreq vU26\n"
+        ".unreq vU53\n" ".unreq vU27\n" ".unreq vV21\n" ".unreq vU65\n"
+        ".unreq vU31\n" ".unreq vU24\n" ".unreq vU36\n" ".unreq vU22\n"
+        ".unreq vU35\n" ".unreq vU63\n" ".unreq vW12\n"
+        ".unreq vV32\n" ".unreq vU16\n" ".unreq vW11\n" ".unreq vU12\n"
+        ".unreq vW31\n" ".unreq vW22\n" ".unreq vU71\n" ".unreq vV11\n"
+        : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0),
+          [n_iters] "+r" (n_iters)
+        : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+          [u_col_stride] "r" (in_col_stride * sizeof(float)),
+          [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+          [w_col_stride] "r" (weight_col_stride * sizeof(float)),
+          [v_row_stride] "r" (out_row_stride * sizeof(float)),
+          [v_col_stride] "r" (out_col_stride * sizeof(float))
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+          "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+          "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0",
+          "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+          "x12", "x13", "x14", "x15", "x16", "cc", "memory"
+    );
+  }
+  if (channels_remaining)
+  {
+    // Fall back on the unoptimised version to clean up the tail
+    ConvImpl::process_tile<false>(
+        channels_remaining,
+        wptr0, weight_row_stride, weight_col_stride,
+        uptr0, in_row_stride, in_col_stride,
+        vptr0, out_row_stride, out_col_stride,
+        0, 0, 0, 0, 0, 0
+    );
+  }
+}
+
+#endif  // __aarch64__
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
 };
 
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
 
 template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index a1aaaa0..44b93a1 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -28,2668 +28,1465 @@
 using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
 using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float, float>;
 
+#ifdef __aarch64__
+
 template <>
-const Conv::TileFn Conv::tile_fns
-  [max_in_pad_top]
-  [max_in_pad_left]
-  [max_in_pad_bottom]
-  [max_in_pad_right]
-  [max_out_pad_bottom]
-  [max_out_pad_right] = {
-  {  // Input pad top = 0
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 0, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 0, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 0, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 0, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 0, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 0, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 0, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 0, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 0, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 0, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 0, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 0, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 1, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 1, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 1, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 1, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 1, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 1, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 1, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 1, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 1, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 1, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 1, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 1, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 2, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 2, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 2, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 2, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 2, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 2, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 2, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 2, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 2, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 2, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 2, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 2, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 3, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 3, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 3, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 3, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 3, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 3, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 3, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 3, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 3, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 3, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 3, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 3, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 3, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 3, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 3, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 3, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 3, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 3, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 3, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 3, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 4, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 4, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 4, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 4, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 4, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 4, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 4, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 4, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 4, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 4, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 4, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 4, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 4, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 4, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 4, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 4, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 0, 4, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 0, 4, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 0, 4, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 0, 4, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 0, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 0, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 0, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 0, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 0, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 0, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 0, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 0, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 0, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 0, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 0, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 0, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 0, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 1, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 1, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 1, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 1, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 1, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 1, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 1, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 1, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 1, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 1, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 1, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 1, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 2, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 2, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 2, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 2, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 2, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 2, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 2, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 2, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 2, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 2, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 2, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 2, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 3, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 3, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 3, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 3, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 3, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 3, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 3, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 3, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 3, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 3, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 3, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 3, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 3, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 3, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 3, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 3, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 3, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 3, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 3, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 3, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 4, 0, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 4, 0, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 4, 0, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 4, 0, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 4, 1, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 4, 1, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 4, 1, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 4, 1, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 4, 2, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 4, 2, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 4, 2, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 4, 2, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 4, 3, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 4, 3, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 4, 3, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 4, 3, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<0, 1, 4, 4, 0, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 0, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 0, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<0, 1, 4, 4, 1, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 1, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 1, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<0, 1, 4, 4, 2, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 2, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 2, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<0, 1, 4, 4, 3, 0>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 3, 1>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 3, 2>,
-            ConvImpl::template process_tile<0, 1, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 1
-  },  // Input pad top = 0
-  {  // Input pad top = 1
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 0, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 0, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 0, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 0, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 0, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 0, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 0, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 0, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 0, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 0, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 0, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 0, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 1, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 1, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 1, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 1, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 1, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 1, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 1, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 1, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 1, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 1, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 1, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 1, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 2, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 2, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 2, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 2, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 2, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 2, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 2, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 2, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 2, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 2, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 2, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 2, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 3, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 3, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 3, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 3, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 3, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 3, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 3, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 3, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 3, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 3, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 3, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 3, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 3, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 3, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 3, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 3, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 3, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 3, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 3, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 3, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 4, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 4, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 4, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 4, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 4, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 4, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 4, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 4, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 4, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 4, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 4, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 4, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 4, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 4, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 4, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 4, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 0, 4, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 0, 4, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 0, 4, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 0, 4, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 0, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 0, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 0, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 0, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 0, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 0, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 0, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 0, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 0, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 0, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 0, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 0, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 0, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 1, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 1, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 1, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 1, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 1, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 1, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 1, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 1, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 1, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 1, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 1, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 1, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 2, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 2, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 2, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 2, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 2, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 2, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 2, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 2, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 2, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 2, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 2, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 2, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 3, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 3, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 3, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 3, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 3, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 3, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 3, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 3, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 3, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 3, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 3, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 3, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 3, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 3, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 3, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 3, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 3, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 3, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 3, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 3, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 4, 0, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 4, 0, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 4, 0, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 4, 0, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 4, 1, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 4, 1, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 4, 1, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 4, 1, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 4, 2, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 4, 2, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 4, 2, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 4, 2, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 4, 3, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 4, 3, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 4, 3, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 4, 3, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            ConvImpl::template process_tile<1, 1, 4, 4, 0, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 0, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 0, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            ConvImpl::template process_tile<1, 1, 4, 4, 1, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 1, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 1, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            ConvImpl::template process_tile<1, 1, 4, 4, 2, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 2, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 2, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            ConvImpl::template process_tile<1, 1, 4, 4, 3, 0>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 3, 1>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 3, 2>,
-            ConvImpl::template process_tile<1, 1, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-      },  // Input pad bottom = 4
-    },  // Input pad left = 1
-  },  // Input pad top = 1
+template <>
+void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
+  const int n_channels,
+  const float* const weights,
+  const int weight_row_stride,
+  const int weight_col_stride,
+  const float* const inptr,
+  const int in_row_stride,
+  const int in_col_stride,
+  float* const outptr,
+  const int out_row_stride,
+  const int out_col_stride,
+  const int, const int, const int, const int, const int, const int
+)
+{
+  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+  constexpr auto kernel_rows = DWC::kernel_rows;
+  constexpr auto kernel_cols = DWC::kernel_cols;
+  constexpr auto output_tile_rows = DWC::output_tile_rows;
+  constexpr auto output_tile_cols = DWC::output_tile_cols;
+  constexpr auto stride_rows = DWC::stride_rows;
+  constexpr auto stride_cols = DWC::stride_cols;
+
+  // Extract parameters
+  const int in_pad_top = 0;
+  const int in_pad_left = 0;
+  const int in_pad_bottom = 0;
+  const int in_pad_right = 0;
+  const int out_pad_bottom = 0;
+  const int out_pad_right = 0;
+
+  // Compute valid ranges of the tile
+  const int in_cells_i = inner_tile_rows - in_pad_bottom;
+  const int in_cells_j = inner_tile_cols - in_pad_right;
+  const int out_cells_i = output_tile_rows - out_pad_bottom;
+  const int out_cells_j = output_tile_cols - out_pad_right;
+
+  // Copy pointers
+  const float *uptr0 = inptr;
+  const float *wptr0 = weights;
+  float *vptr0 = outptr;
+  const bool same_strides = (
+    weight_col_stride == in_col_stride &&
+    weight_col_stride == out_col_stride
+  );
+
+  int channels_remaining = n_channels;
+  if (channels_remaining >= 4 && same_strides)
+  {
+    int c4_rem = channels_remaining / 4;
+    channels_remaining %= 4;
+    const int prefetch_depth = 8;
+
+    asm volatile (
+      "qW22 .req q0\n" "vW22 .req v0\n"
+      "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
+      "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
+      "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
+      "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
+      "qW21 .req q3\n" "vW21 .req v3\n"
+      "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
+      "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
+      "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
+      "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
+      "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
+      "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
+      "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
+      "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
+      "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
+      "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
+      "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
+      "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
+      "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
+      "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
+      "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
+      "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
+      "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
+      "qW33 .req q16\n" "vW33 .req v16\n"
+      "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
+      "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
+      "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
+      "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
+      "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
+      "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
+      "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
+      "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
+      "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
+      "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
+      "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
+      "qU23 .req q28\n" "qU52 .req q28\n"
+      "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
+
+      "uptr1 .req x0\n"
+      "uptr2 .req x1\n"
+      "uptr3 .req x2\n"
+      "uptr4 .req x3\n"
+      "uptr5 .req x4\n"
+
+      "vptr1 .req x5\n"
+      "vptr2 .req x6\n"
+      "vptr3 .req x7\n"
+
+      "wptr1 .req x8\n"
+      "wptr2 .req x9\n"
+
+      // Prepare pointers and strides
+      "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+      "add uptr2,    uptr1 , %x[u_row_stride]\n"
+      "add uptr3,    uptr2 , %x[u_row_stride]\n"
+      "add uptr4,    uptr3 , %x[u_row_stride]\n"
+      "add uptr5,    uptr4 , %x[u_row_stride]\n"
+
+      "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+      "add vptr2,    vptr1 , %x[v_row_stride]\n"
+      "add vptr3,    vptr2 , %x[v_row_stride]\n"
+
+      "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+      "add wptr2,    wptr1 , %x[w_row_stride]\n"
+
+      // Load initial operands
+      "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
+      "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
+      "subs %x[c4_rem], %x[c4_rem], #1\n"
+      "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
+      "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
+      "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
+      "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
+      "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
+      "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
+      "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
+      "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
+      "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
+      "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
+      "ldr qW11, [%x[wptr0]], #0x10\n"
+      "fmul vV14.4s, vU16.4s, vW13.4s\n"
+      "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
+      "fmul vV13.4s, vU15.4s, vW13.4s\n"
+      "ldr qW31, [wptr2], #0x10\n"
+      "fmla vV14.4s, vU15.4s, vW12.4s\n"
+      "ldr qW21, [wptr1], #0x10\n"
+      "fmul vV12.4s, vU14.4s, vW13.4s\n"
+      "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
+      "fmla vV13.4s, vU14.4s, vW12.4s\n"
+      "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
+      "fmla vV14.4s, vU14.4s, vW11.4s\n"
+      "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
+      "fmla vV14.4s, vU26.4s, vW23.4s\n"
+      "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
+      "fmul vV24.4s, vU26.4s, vW13.4s\n"
+      "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
+      "fmla vV13.4s, vU25.4s, vW23.4s\n"
+      "beq 2f\n"  // Single iteration only
+
+      "1:"  // Loop body
+        "fmla vV14.4s, vU25.4s, vW22.4s\n"
+        "prfm pldl1keep, [%x[wptr0], %[prftch]]\n"
+        "fmul vV23.4s, vU25.4s, vW13.4s\n"
+        "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV24.4s, vU25.4s, vW12.4s\n"
+        "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
+        "fmla vV12.4s, vU24.4s, vW23.4s\n"
+        "prfm pldl1keep, [%x[wptr0], %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV13.4s, vU24.4s, vW22.4s\n"
+        "prfm pldl1keep, [   wptr1 , %[prftch]]\n"
+        "fmla vV14.4s, vU24.4s, vW21.4s\n"
+        "prfm pldl1keep, [   wptr1 , %x[prftch_uvw_col_stride1]]\n"
+        "fmul vV22.4s, vU24.4s, vW13.4s\n"
+        "prfm pldl1keep, [   wptr1 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV23.4s, vU24.4s, vW12.4s\n"
+        "prfm pldl1keep, [   wptr2 , %x[prftch]]\n"
+        "fmla vV24.4s, vU24.4s, vW11.4s\n"
+        "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
+        "fmla vV14.4s, vU36.4s, vW33.4s\n"
+        "prfm pldl1keep, [   wptr2 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV24.4s, vU36.4s, vW23.4s\n"
+        "prfm pldl1keep, [   wptr2 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmul vV34.4s, vU36.4s, vW13.4s\n"
+        "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
+        "fmla vV13.4s, vU35.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV14.4s, vU35.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV23.4s, vU35.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV24.4s, vU35.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride4] ]\n"
+        "fmul vV33.4s, vU35.4s, vW13.4s\n"
+        "prfm pldl1keep, [   uptr2 , %x[prftch_uvw_col_stride5] ]\n"
+        "fmla vV34.4s, vU35.4s, vW12.4s\n"
+        "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
+        "fmla vV12.4s, vU34.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr3 , %[prftch]]\n"
+        "fmla vV13.4s, vU34.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV14.4s, vU34.4s, vW31.4s\n"
+        "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
+        "fmla vV22.4s, vU34.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV23.4s, vU34.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV24.4s, vU34.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride4] ]\n"
+        "fmul vV32.4s, vU34.4s, vW13.4s\n"
+        "prfm pldl1keep, [   uptr3 , %x[prftch_uvw_col_stride5] ]\n"
+        "fmla vV33.4s, vU34.4s, vW12.4s\n"
+        "prfm pldl1keep, [   uptr4 , %[prftch]]\n"
+        "fmla vV34.4s, vU34.4s, vW11.4s\n"
+        "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
+        "fmla vV24.4s, vU46.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV34.4s, vU46.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmul vV44.4s, vU46.4s, vW13.4s\n"
+        "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
+        "fmla vV23.4s, vU45.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV24.4s, vU45.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride4] ]\n"
+        "fmla vV33.4s, vU45.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr4 , %x[prftch_uvw_col_stride5] ]\n"
+        "fmla vV34.4s, vU45.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr5 , %[prftch]]\n"
+        "fmul vV43.4s, vU45.4s, vW13.4s\n"
+        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV44.4s, vU45.4s, vW12.4s\n"
+        "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
+        "fmla vV22.4s, vU44.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV23.4s, vU44.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV24.4s, vU44.4s, vW31.4s\n"
+        "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
+        "fmla vV32.4s, vU44.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride4] ]\n"
+        "fmla vV33.4s, vU44.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr5 , %x[prftch_uvw_col_stride5] ]\n"
+        "fmla vV34.4s, vU44.4s, vW21.4s\n"
+        "prfm pstl1keep, [%x[vptr0], %[prftch]]\n"
+        "fmul vV42.4s, vU44.4s, vW13.4s\n"
+        "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV43.4s, vU44.4s, vW12.4s\n"
+        "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV44.4s, vU44.4s, vW11.4s\n"
+        "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
+        "fmla vV34.4s, vU56.4s, vW33.4s\n"
+        "prfm pstl1keep, [%x[vptr0], %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV44.4s, vU56.4s, vW23.4s\n"
+        "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
+        "fmla vV33.4s, vU55.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr1 , %[prftch]]\n"
+        "fmla vV34.4s, vU55.4s, vW32.4s\n"
+        "prfm pstl1keep, [   vptr1 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV43.4s, vU55.4s, vW23.4s\n"
+        "prfm pstl1keep, [   vptr1 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV44.4s, vU55.4s, vW22.4s\n"
+        "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
+        "fmla vV32.4s, vU54.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr1 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV33.4s, vU54.4s, vW32.4s\n"
+        "prfm pstl1keep, [   vptr2 , %[prftch]]\n"
+        "fmla vV34.4s, vU54.4s, vW31.4s\n"
+        "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
+        "fmla vV42.4s, vU54.4s, vW23.4s\n"
+        "prfm pstl1keep, [   vptr2 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV43.4s, vU54.4s, vW22.4s\n"
+        "prfm pstl1keep, [   vptr2 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV44.4s, vU54.4s, vW21.4s\n"
+        "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
+        "fmla vV44.4s, vU66.4s, vW33.4s\n"
+        "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
+        "fmla vV43.4s, vU65.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr2 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV44.4s, vU65.4s, vW32.4s\n"
+        "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
+        "fmla vV42.4s, vU64.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr3 , %[prftch]]\n"
+        "fmla vV43.4s, vU64.4s, vW32.4s\n"
+        "prfm pstl1keep, [   vptr3 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV44.4s, vU64.4s, vW31.4s\n"
+        "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
+        "fmul vV11.4s, vU13.4s, vW13.4s\n"
+        "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
+        "fmla vV12.4s, vU13.4s, vW12.4s\n"
+        "prfm pstl1keep, [   vptr3 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV13.4s, vU13.4s, vW11.4s\n"
+        "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
+        "fmla vV11.4s, vU23.4s, vW23.4s\n"
+        "prfm pstl1keep, [   vptr3 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV12.4s, vU23.4s, vW22.4s\n"
+        "fmla vV13.4s, vU23.4s, vW21.4s\n"
+        "fmul vV21.4s, vU23.4s, vW13.4s\n"
+        "fmla vV22.4s, vU23.4s, vW12.4s\n"
+        "fmla vV23.4s, vU23.4s, vW11.4s\n"
+        "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
+        "fmla vV11.4s, vU33.4s, vW33.4s\n"
+        "fmla vV12.4s, vU33.4s, vW32.4s\n"
+        "fmla vV13.4s, vU33.4s, vW31.4s\n"
+        "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
+        "fmla vV21.4s, vU33.4s, vW23.4s\n"
+        "fmla vV22.4s, vU33.4s, vW22.4s\n"
+        "fmla vV23.4s, vU33.4s, vW21.4s\n"
+        "fmul vV31.4s, vU33.4s, vW13.4s\n"
+        "fmla vV32.4s, vU33.4s, vW12.4s\n"
+        "fmla vV33.4s, vU33.4s, vW11.4s\n"
+        "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
+        "fmla vV21.4s, vU43.4s, vW33.4s\n"
+        "fmla vV22.4s, vU43.4s, vW32.4s\n"
+        "fmla vV23.4s, vU43.4s, vW31.4s\n"
+        "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
+        "fmla vV31.4s, vU43.4s, vW23.4s\n"
+        "fmla vV32.4s, vU43.4s, vW22.4s\n"
+        "fmla vV33.4s, vU43.4s, vW21.4s\n"
+        "fmul vV41.4s, vU43.4s, vW13.4s\n"
+        "ldr qW13, [%x[wptr0], %x[uvw_col_stride2]]\n"
+        "fmla vV42.4s, vU43.4s, vW12.4s\n"
+        "fmla vV43.4s, vU43.4s, vW11.4s\n"
+        "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
+        "fmla vV31.4s, vU53.4s, vW33.4s\n"
+        "fmla vV32.4s, vU53.4s, vW32.4s\n"
+        "fmla vV33.4s, vU53.4s, vW31.4s\n"
+        "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
+        "fmla vV41.4s, vU53.4s, vW23.4s\n"
+        "ldr qW23, [wptr1, %x[uvw_col_stride2]]\n"
+        "fmla vV42.4s, vU53.4s, vW22.4s\n"
+        "fmla vV43.4s, vU53.4s, vW21.4s\n"
+        "ldr qU11, [%x[uptr0]], #0x10\n"
+        "fmla vV41.4s, vU63.4s, vW33.4s\n"
+        "ldr qW33, [wptr2, %x[uvw_col_stride2]]\n"
+        "fmla vV42.4s, vU63.4s, vW32.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %[prftch]]\n"
+        "fmla vV43.4s, vU63.4s, vW31.4s\n"
+        "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
+        "fmla vV11.4s, vU12.4s, vW12.4s\n"
+        "ldr qU21, [uptr1], #0x10\n"
+        "fmla vV12.4s, vU12.4s, vW11.4s\n"
+        "ldr qU31, [uptr2], #0x10\n"
+        "fmla vV11.4s, vU22.4s, vW22.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV12.4s, vU22.4s, vW21.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV21.4s, vU22.4s, vW12.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV22.4s, vU22.4s, vW11.4s\n"
+        "ldr qU41, [uptr3], #0x10\n"
+        "fmla vV11.4s, vU32.4s, vW32.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride4] ]\n"
+        "fmla vV12.4s, vU32.4s, vW31.4s\n"
+        "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
+        "fmla vV21.4s, vU32.4s, vW22.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %x[prftch_uvw_col_stride5] ]\n"
+        "fmla vV22.4s, vU32.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr1 , %[prftch]]\n"
+        "fmla vV31.4s, vU32.4s, vW12.4s\n"
+        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride1]]\n"
+        "fmla vV32.4s, vU32.4s, vW11.4s\n"
+        "ldr qU51, [uptr4], #0x10\n"
+        "fmla vV21.4s, vU42.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride2] ]\n"
+        "fmla vV22.4s, vU42.4s, vW31.4s\n"
+        "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
+        "fmla vV31.4s, vU42.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride3] ]\n"
+        "fmla vV32.4s, vU42.4s, vW21.4s\n"
+        "subs %x[c4_rem], %x[c4_rem], #1\n"
+        "fmla vV41.4s, vU42.4s, vW12.4s\n"
+        "ldr qW12, [%x[wptr0], %x[uvw_col_stride1]]\n"
+        "fmla vV42.4s, vU42.4s, vW11.4s\n"
+        "ldr qU61, [uptr5], #0x10\n"
+        "fmla vV31.4s, vU52.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride4] ]\n"
+        "fmla vV32.4s, vU52.4s, vW31.4s\n"
+        "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
+        "fmla vV41.4s, vU52.4s, vW22.4s\n"
+        "ldr qW22, [wptr1, %x[uvw_col_stride1]]\n"
+        "fmla vV42.4s, vU52.4s, vW21.4s\n"
+        "ldr qU16, [%x[uptr0], %x[uvw_col_stride5]]\n"
+        "fmla vV41.4s, vU62.4s, vW32.4s\n"
+        "ldr qW32, [wptr2, %x[uvw_col_stride1]]\n"
+        "fmla vV42.4s, vU62.4s, vW31.4s\n"
+        "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
+        "fmla vV11.4s, vU11.4s, vW11.4s\n"
+        "ldr qU15, [%x[uptr0], %x[uvw_col_stride4]]\n"
+        "fmla vV11.4s, vU21.4s, vW21.4s\n"
+        "ldr qU14, [%x[uptr0], %x[uvw_col_stride3]]\n"
+        "fmla vV21.4s, vU21.4s, vW11.4s\n"
+        "ldr qU26, [uptr1, %x[uvw_col_stride5]]\n"
+        "fmla vV11.4s, vU31.4s, vW31.4s\n"
+        "str qV11, [%x[vptr0]], #0x10\n"
+        "fmla vV21.4s, vU31.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr1 , %x[prftch_uvw_col_stride5] ]\n"
+        "fmla vV31.4s, vU31.4s, vW11.4s\n"
+        "ldr qU25, [uptr1, %x[uvw_col_stride4]]\n"
+        "fmla vV21.4s, vU41.4s, vW31.4s\n"
+        "str qV21, [vptr1], #0x10\n"
+        "fmla vV31.4s, vU41.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr2 , %[prftch]]\n"
+        "fmla vV41.4s, vU41.4s, vW11.4s\n"
+        "ldr qW11, [%x[wptr0]], #0x10\n"
+        "fmla vV31.4s, vU51.4s, vW31.4s\n"
+        "str qV31, [vptr2], #0x10\n"
+        "fmla vV41.4s, vU51.4s, vW21.4s\n"
+        "ldr qU36, [uptr2, %x[uvw_col_stride5]]\n"
+        "fmla vV41.4s, vU61.4s, vW31.4s\n"
+        "str qV41, [vptr3], #0x10\n"
+        "fmul vV14.4s, vU16.4s, vW13.4s\n"
+        "ldr qU24, [uptr1, %x[uvw_col_stride3]]\n"
+        "fmul vV13.4s, vU15.4s, vW13.4s\n"
+        "ldr qW31, [wptr2], #0x10\n"
+        "fmla vV14.4s, vU15.4s, vW12.4s\n"
+        "ldr qW21, [wptr1], #0x10\n"
+        "fmul vV12.4s, vU14.4s, vW13.4s\n"
+        "ldr qU34, [uptr2, %x[uvw_col_stride3]]\n"
+        "fmla vV13.4s, vU14.4s, vW12.4s\n"
+        "ldr qU46, [uptr3, %x[uvw_col_stride5]]\n"
+        "fmla vV14.4s, vU14.4s, vW11.4s\n"
+        "ldr qU45, [uptr3, %x[uvw_col_stride4]]\n"
+        "fmla vV14.4s, vU26.4s, vW23.4s\n"
+        "ldr qU35, [uptr2, %x[uvw_col_stride4]]\n"
+        "fmul vV24.4s, vU26.4s, vW13.4s\n"
+        "ldr qU44, [uptr3, %x[uvw_col_stride3]]\n"
+        "fmla vV13.4s, vU25.4s, vW23.4s\n"
+        "bne 1b\n"
+
+      "2:"  // Final iteration
+        "fmla vV14.4s, vU25.4s, vW22.4s\n"
+        "fmul vV23.4s, vU25.4s, vW13.4s\n"
+        "fmla vV24.4s, vU25.4s, vW12.4s\n"
+        "ldr qU56, [uptr4, %x[uvw_col_stride5]]\n"
+        "fmla vV12.4s, vU24.4s, vW23.4s\n"
+        "fmla vV13.4s, vU24.4s, vW22.4s\n"
+        "fmla vV14.4s, vU24.4s, vW21.4s\n"
+        "fmul vV22.4s, vU24.4s, vW13.4s\n"
+        "fmla vV23.4s, vU24.4s, vW12.4s\n"
+        "fmla vV24.4s, vU24.4s, vW11.4s\n"
+        "ldr qU55, [uptr4, %x[uvw_col_stride4]]\n"
+        "fmla vV14.4s, vU36.4s, vW33.4s\n"
+        "fmla vV24.4s, vU36.4s, vW23.4s\n"
+        "fmul vV34.4s, vU36.4s, vW13.4s\n"
+        "ldr qU54, [uptr4, %x[uvw_col_stride3]]\n"
+        "fmla vV13.4s, vU35.4s, vW33.4s\n"
+        "fmla vV14.4s, vU35.4s, vW32.4s\n"
+        "fmla vV23.4s, vU35.4s, vW23.4s\n"
+        "fmla vV24.4s, vU35.4s, vW22.4s\n"
+        "fmul vV33.4s, vU35.4s, vW13.4s\n"
+        "fmla vV34.4s, vU35.4s, vW12.4s\n"
+        "ldr qU66, [uptr5, %x[uvw_col_stride5]]\n"
+        "fmla vV12.4s, vU34.4s, vW33.4s\n"
+        "fmla vV13.4s, vU34.4s, vW32.4s\n"
+        "fmla vV14.4s, vU34.4s, vW31.4s\n"
+        "str qV14, [%x[vptr0], %x[uvw_col_stride3]]\n"
+        "fmla vV22.4s, vU34.4s, vW23.4s\n"
+        "fmla vV23.4s, vU34.4s, vW22.4s\n"
+        "fmla vV24.4s, vU34.4s, vW21.4s\n"
+        "fmul vV32.4s, vU34.4s, vW13.4s\n"
+        "fmla vV33.4s, vU34.4s, vW12.4s\n"
+        "fmla vV34.4s, vU34.4s, vW11.4s\n"
+        "ldr qU65, [uptr5, %x[uvw_col_stride4]]\n"
+        "fmla vV24.4s, vU46.4s, vW33.4s\n"
+        "fmla vV34.4s, vU46.4s, vW23.4s\n"
+        "fmul vV44.4s, vU46.4s, vW13.4s\n"
+        "ldr qU64, [uptr5, %x[uvw_col_stride3]]\n"
+        "fmla vV23.4s, vU45.4s, vW33.4s\n"
+        "fmla vV24.4s, vU45.4s, vW32.4s\n"
+        "fmla vV33.4s, vU45.4s, vW23.4s\n"
+        "fmla vV34.4s, vU45.4s, vW22.4s\n"
+        "fmul vV43.4s, vU45.4s, vW13.4s\n"
+        "fmla vV44.4s, vU45.4s, vW12.4s\n"
+        "ldr qU13, [%x[uptr0], %x[uvw_col_stride2]]\n"
+        "fmla vV22.4s, vU44.4s, vW33.4s\n"
+        "fmla vV23.4s, vU44.4s, vW32.4s\n"
+        "fmla vV24.4s, vU44.4s, vW31.4s\n"
+        "str qV24, [vptr1, %x[uvw_col_stride3]]\n"
+        "fmla vV32.4s, vU44.4s, vW23.4s\n"
+        "fmla vV33.4s, vU44.4s, vW22.4s\n"
+        "fmla vV34.4s, vU44.4s, vW21.4s\n"
+        "fmul vV42.4s, vU44.4s, vW13.4s\n"
+        "fmla vV43.4s, vU44.4s, vW12.4s\n"
+        "fmla vV44.4s, vU44.4s, vW11.4s\n"
+        "ldr qU23, [uptr1, %x[uvw_col_stride2]]\n"
+        "fmla vV34.4s, vU56.4s, vW33.4s\n"
+        "fmla vV44.4s, vU56.4s, vW23.4s\n"
+        "ldr qU33, [uptr2, %x[uvw_col_stride2]]\n"
+        "fmla vV33.4s, vU55.4s, vW33.4s\n"
+        "fmla vV34.4s, vU55.4s, vW32.4s\n"
+        "fmla vV43.4s, vU55.4s, vW23.4s\n"
+        "fmla vV44.4s, vU55.4s, vW22.4s\n"
+        "ldr qU43, [uptr3, %x[uvw_col_stride2]]\n"
+        "fmla vV32.4s, vU54.4s, vW33.4s\n"
+        "fmla vV33.4s, vU54.4s, vW32.4s\n"
+        "fmla vV34.4s, vU54.4s, vW31.4s\n"
+        "str qV34, [vptr2, %x[uvw_col_stride3]]\n"
+        "fmla vV42.4s, vU54.4s, vW23.4s\n"
+        "fmla vV43.4s, vU54.4s, vW22.4s\n"
+        "fmla vV44.4s, vU54.4s, vW21.4s\n"
+        "ldr qU53, [uptr4, %x[uvw_col_stride2]]\n"
+        "fmla vV44.4s, vU66.4s, vW33.4s\n"
+        "ldr qU63, [uptr5, %x[uvw_col_stride2]]\n"
+        "fmla vV43.4s, vU65.4s, vW33.4s\n"
+        "fmla vV44.4s, vU65.4s, vW32.4s\n"
+        "ldr qU12, [%x[uptr0], %x[uvw_col_stride1]]\n"
+        "fmla vV42.4s, vU64.4s, vW33.4s\n"
+        "fmla vV43.4s, vU64.4s, vW32.4s\n"
+        "fmla vV44.4s, vU64.4s, vW31.4s\n"
+        "str qV44, [vptr3, %x[uvw_col_stride3]]\n"
+        "fmul vV11.4s, vU13.4s, vW13.4s\n"
+        "ldr qU22, [uptr1, %x[uvw_col_stride1]]\n"
+        "fmla vV12.4s, vU13.4s, vW12.4s\n"
+        "fmla vV13.4s, vU13.4s, vW11.4s\n"
+        "ldr qU32, [uptr2, %x[uvw_col_stride1]]\n"
+        "fmla vV11.4s, vU23.4s, vW23.4s\n"
+        "fmla vV12.4s, vU23.4s, vW22.4s\n"
+        "fmla vV13.4s, vU23.4s, vW21.4s\n"
+        "fmul vV21.4s, vU23.4s, vW13.4s\n"
+        "fmla vV22.4s, vU23.4s, vW12.4s\n"
+        "fmla vV23.4s, vU23.4s, vW11.4s\n"
+        "ldr qU42, [uptr3, %x[uvw_col_stride1]]\n"
+        "fmla vV11.4s, vU33.4s, vW33.4s\n"
+        "fmla vV12.4s, vU33.4s, vW32.4s\n"
+        "fmla vV13.4s, vU33.4s, vW31.4s\n"
+        "str qV13, [%x[vptr0], %x[uvw_col_stride2]]\n"
+        "fmla vV21.4s, vU33.4s, vW23.4s\n"
+        "fmla vV22.4s, vU33.4s, vW22.4s\n"
+        "fmla vV23.4s, vU33.4s, vW21.4s\n"
+        "fmul vV31.4s, vU33.4s, vW13.4s\n"
+        "fmla vV32.4s, vU33.4s, vW12.4s\n"
+        "fmla vV33.4s, vU33.4s, vW11.4s\n"
+        "ldr qU52, [uptr4, %x[uvw_col_stride1]]\n"
+        "fmla vV21.4s, vU43.4s, vW33.4s\n"
+        "fmla vV22.4s, vU43.4s, vW32.4s\n"
+        "fmla vV23.4s, vU43.4s, vW31.4s\n"
+        "str qV23, [vptr1, %x[uvw_col_stride2]]\n"
+        "fmla vV31.4s, vU43.4s, vW23.4s\n"
+        "fmla vV32.4s, vU43.4s, vW22.4s\n"
+        "fmla vV33.4s, vU43.4s, vW21.4s\n"
+        "fmul vV41.4s, vU43.4s, vW13.4s\n"
+        "fmla vV42.4s, vU43.4s, vW12.4s\n"
+        "fmla vV43.4s, vU43.4s, vW11.4s\n"
+        "ldr qU62, [uptr5, %x[uvw_col_stride1]]\n"
+        "fmla vV31.4s, vU53.4s, vW33.4s\n"
+        "fmla vV32.4s, vU53.4s, vW32.4s\n"
+        "fmla vV33.4s, vU53.4s, vW31.4s\n"
+        "str qV33, [vptr2, %x[uvw_col_stride2]]\n"
+        "fmla vV41.4s, vU53.4s, vW23.4s\n"
+        "fmla vV42.4s, vU53.4s, vW22.4s\n"
+        "fmla vV43.4s, vU53.4s, vW21.4s\n"
+        "ldr qU11, [%x[uptr0]], #0x10\n"
+        "fmla vV41.4s, vU63.4s, vW33.4s\n"
+        "fmla vV42.4s, vU63.4s, vW32.4s\n"
+        "fmla vV43.4s, vU63.4s, vW31.4s\n"
+        "str qV43, [vptr3, %x[uvw_col_stride2]]\n"
+        "fmla vV11.4s, vU12.4s, vW12.4s\n"
+        "ldr qU21, [uptr1], #0x10\n"
+        "fmla vV12.4s, vU12.4s, vW11.4s\n"
+        "ldr qU31, [uptr2], #0x10\n"
+        "fmla vV11.4s, vU22.4s, vW22.4s\n"
+        "fmla vV12.4s, vU22.4s, vW21.4s\n"
+        "fmla vV21.4s, vU22.4s, vW12.4s\n"
+        "fmla vV22.4s, vU22.4s, vW11.4s\n"
+        "ldr qU41, [uptr3], #0x10\n"
+        "fmla vV11.4s, vU32.4s, vW32.4s\n"
+        "fmla vV12.4s, vU32.4s, vW31.4s\n"
+        "str qV12, [%x[vptr0], %x[uvw_col_stride1]]\n"
+        "fmla vV21.4s, vU32.4s, vW22.4s\n"
+        "fmla vV22.4s, vU32.4s, vW21.4s\n"
+        "fmla vV31.4s, vU32.4s, vW12.4s\n"
+        "fmla vV32.4s, vU32.4s, vW11.4s\n"
+        "ldr qU51, [uptr4], #0x10\n"
+        "fmla vV21.4s, vU42.4s, vW32.4s\n"
+        "fmla vV22.4s, vU42.4s, vW31.4s\n"
+        "str qV22, [vptr1, %x[uvw_col_stride1]]\n"
+        "fmla vV31.4s, vU42.4s, vW22.4s\n"
+        "fmla vV32.4s, vU42.4s, vW21.4s\n"
+        "subs %x[c4_rem], %x[c4_rem], #1\n"
+        "fmla vV41.4s, vU42.4s, vW12.4s\n"
+        "fmla vV42.4s, vU42.4s, vW11.4s\n"
+        "ldr qU61, [uptr5], #0x10\n"
+        "fmla vV31.4s, vU52.4s, vW32.4s\n"
+        "fmla vV32.4s, vU52.4s, vW31.4s\n"
+        "str qV32, [vptr2, %x[uvw_col_stride1]]\n"
+        "fmla vV41.4s, vU52.4s, vW22.4s\n"
+        "fmla vV42.4s, vU52.4s, vW21.4s\n"
+        "fmla vV41.4s, vU62.4s, vW32.4s\n"
+        "fmla vV42.4s, vU62.4s, vW31.4s\n"
+        "str qV42, [vptr3, %x[uvw_col_stride1]]\n"
+        "fmla vV11.4s, vU11.4s, vW11.4s\n"
+        "fmla vV11.4s, vU21.4s, vW21.4s\n"
+        "fmla vV21.4s, vU21.4s, vW11.4s\n"
+        "fmla vV11.4s, vU31.4s, vW31.4s\n"
+        "str qV11, [%x[vptr0]], #0x10\n"
+        "fmla vV21.4s, vU31.4s, vW21.4s\n"
+        "fmla vV31.4s, vU31.4s, vW11.4s\n"
+        "fmla vV21.4s, vU41.4s, vW31.4s\n"
+        "str qV21, [vptr1], #0x10\n"
+        "fmla vV31.4s, vU41.4s, vW21.4s\n"
+        "fmla vV41.4s, vU41.4s, vW11.4s\n"
+        "fmla vV31.4s, vU51.4s, vW31.4s\n"
+        "str qV31, [vptr2], #0x10\n"
+        "fmla vV41.4s, vU51.4s, vW21.4s\n"
+        "fmla vV41.4s, vU61.4s, vW31.4s\n"
+        "str qV41, [vptr3], #0x10\n"
+
+      ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
+      ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
+      ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
+      ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
+      ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
+      ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
+      ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
+      ".unreq qV22\n" ".unreq qU14\n"
+      ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
+      ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
+      ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
+      ".unreq qW33\n"
+      ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
+      ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
+      ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
+      ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
+      ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
+      ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
+      ".unreq qU53\n" ".unreq vW22\n"
+      ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
+      ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
+      ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
+      ".unreq vV12\n" ".unreq vU61\n"
+      ".unreq vU26\n" ".unreq vV32\n"
+      ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
+      ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
+      ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
+      ".unreq vV22\n" ".unreq vU14\n"
+      ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
+      ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
+      ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
+      ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
+      ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
+      ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
+      ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
+      ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
+      ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
+      : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
+        [c4_rem] "+r" (c4_rem)
+      : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+        [v_row_stride] "r" (out_row_stride * sizeof(float)),
+        [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+        [uvw_col_stride1] "r" (1 * in_col_stride * sizeof(float)),
+        [uvw_col_stride2] "r" (2 * in_col_stride * sizeof(float)),
+        [uvw_col_stride3] "r" (3 * in_col_stride * sizeof(float)),
+        [uvw_col_stride4] "r" (4 * in_col_stride * sizeof(float)),
+        [uvw_col_stride5] "r" (5 * in_col_stride * sizeof(float)),
+        [prftch] "i" (prefetch_depth * sizeof(float)),
+        [prftch_uvw_col_stride1] "r" ((prefetch_depth + 1 * in_col_stride) * sizeof(float)),
+        [prftch_uvw_col_stride2] "r" ((prefetch_depth + 2 * in_col_stride) * sizeof(float)),
+        [prftch_uvw_col_stride3] "r" ((prefetch_depth + 3 * in_col_stride) * sizeof(float)),
+        [prftch_uvw_col_stride4] "r" ((prefetch_depth + 4 * in_col_stride) * sizeof(float)),
+        [prftch_uvw_col_stride5] "r" ((prefetch_depth + 5 * in_col_stride) * sizeof(float))
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
+        "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+    );
+  }
+  else if (channels_remaining >= 4)
+  {
+    int c4_rem = channels_remaining / 4;
+    channels_remaining %= 4;
+
+    asm volatile (
+      "qW22 .req q0\n" "vW22 .req v0\n"
+      "qU64 .req q1\n" "qU35 .req q1\n" "qV41 .req q1\n"
+      "vU64 .req v1\n" "vU35 .req v1\n" "vV41 .req v1\n"
+      "qU34 .req q2\n" "qU21 .req q2\n" "qV43 .req q2\n"
+      "vU34 .req v2\n" "vU21 .req v2\n" "vV43 .req v2\n"
+      "qW21 .req q3\n" "vW21 .req v3\n"
+      "qU24 .req q4\n" "qU54 .req q4\n" "qV31 .req q4\n"
+      "vU24 .req v4\n" "vU54 .req v4\n" "vV31 .req v4\n"
+      "qV12 .req q5\n" "qU61 .req q5\n" "vV12 .req v5\n" "vU61 .req v5\n"
+      "qU26 .req q6\n" "qV32 .req q6\n" "vU26 .req v6\n" "vV32 .req v6\n"
+      "qU36 .req q7\n" "qU51 .req q7\n" "qU66 .req q7\n" "qU12 .req q7\n"
+      "vU36 .req v7\n" "vU51 .req v7\n" "vU66 .req v7\n" "vU12 .req v7\n"
+      "qV14 .req q8\n" "qV11 .req q8\n" "qU65 .req q8\n"
+      "vV14 .req v8\n" "vV11 .req v8\n" "vU65 .req v8\n"
+      "qU15 .req q9\n" "qU22 .req q9\n" "qU45 .req q9\n"
+      "vU15 .req v9\n" "vU22 .req v9\n" "vU45 .req v9\n"
+      "qV22 .req q10\n" "qU14 .req q10\n" "vV22 .req v10\n" "vU14 .req v10\n"
+      "qU44 .req q11\n" "qU43 .req q11\n" "qU11 .req q11\n"
+      "vU44 .req v11\n" "vU43 .req v11\n" "vU11 .req v11\n"
+      "qV24 .req q12\n" "qV42 .req q12\n" "vV24 .req v12\n" "vV42 .req v12\n"
+      "qW31 .req q13\n" "vW31 .req v13\n" "qW13 .req q14\n" "vW13 .req v14\n"
+      "qU33 .req q15\n" "qU62 .req q15\n" "qU25 .req q15\n" "qU56 .req q15\n"
+      "vU33 .req v15\n" "vU62 .req v15\n" "vU25 .req v15\n" "vU56 .req v15\n"
+      "qW33 .req q16\n" "vW33 .req v16\n"
+      "qU42 .req q17\n" "qU16 .req q17\n" "qV44 .req q17\n"
+      "vU42 .req v17\n" "vU16 .req v17\n" "vV44 .req v17\n"
+      "qU63 .req q18\n" "qU31 .req q18\n" "qV34 .req q18\n"
+      "vU63 .req v18\n" "vU31 .req v18\n" "vV34 .req v18\n"
+      "qW11 .req q19\n" "vW11 .req v19\n" "qU41 .req q20\n" "qV13 .req q20\n"
+      "vU41 .req v20\n" "vV13 .req v20\n" "qV33 .req q21\n" "vV33 .req v21\n"
+      "qU46 .req q22\n" "qU32 .req q22\n" "qU13 .req q22\n"
+      "vU46 .req v22\n" "vU32 .req v22\n" "vU13 .req v22\n" "qW23 .req q23\n"
+      "vW23 .req v23\n" "qV23 .req q24\n" "vV23 .req v24\n"
+      "qV21 .req q25\n" "qU55 .req q25\n" "vV21 .req v25\n" "vU55 .req v25\n"
+      "qW12 .req q26\n" "vW12 .req v26\n" "qW32 .req q27\n" "vW32 .req v27\n"
+      "qU23 .req q28\n" "qU52 .req q28\n"
+      "vU23 .req v28\n" "vU52 .req v28\n" "qU53 .req q29\n" "vU53 .req v29\n"
+
+      "uptr1 .req x0\n"
+      "uptr2 .req x1\n"
+      "uptr3 .req x2\n"
+      "uptr4 .req x3\n"
+      "uptr5 .req x4\n"
+
+      "vptr1 .req x5\n"
+      "vptr2 .req x6\n"
+      "vptr3 .req x7\n"
+
+      "wptr1 .req x8\n"
+      "wptr2 .req x9\n"
+
+      "u_col_stride2 .req x10\n"
+      "u_col_stride3 .req x11\n"
+      "u_col_stride4 .req x12\n"
+      "u_col_stride5 .req x13\n"
+
+      "v_col_stride2 .req x14\n"
+      "v_col_stride3 .req x15\n"
+
+      "w_col_stride2 .req x16\n"
+
+      // Prepare pointers and strides
+      "add uptr1, %x[uptr0], %x[u_row_stride]\n"
+      "add uptr2,    uptr1 , %x[u_row_stride]\n"
+      "add uptr3,    uptr2 , %x[u_row_stride]\n"
+      "add uptr4,    uptr3 , %x[u_row_stride]\n"
+      "add uptr5,    uptr4 , %x[u_row_stride]\n"
+
+      "add vptr1, %x[vptr0], %x[v_row_stride]\n"
+      "add vptr2,    vptr1 , %x[v_row_stride]\n"
+      "add vptr3,    vptr2 , %x[v_row_stride]\n"
+
+      "add wptr1, %x[wptr0], %x[w_row_stride]\n"
+      "add wptr2,    wptr1 , %x[w_row_stride]\n"
+
+      "add u_col_stride2, %x[u_col_stride1], %x[u_col_stride1]\n"
+      "add u_col_stride3,    u_col_stride2 , %x[u_col_stride1]\n"
+      "add u_col_stride4,    u_col_stride3 , %x[u_col_stride1]\n"
+      "add u_col_stride5,    u_col_stride4 , %x[u_col_stride1]\n"
+
+      "add v_col_stride2, %x[v_col_stride1], %x[v_col_stride1]\n"
+      "add v_col_stride3,    v_col_stride2 , %x[v_col_stride1]\n"
+
+      "add w_col_stride2, %x[w_col_stride1], %x[w_col_stride1]\n"
+
+      // Load initial operands
+      "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+      "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+      "subs %x[c4_rem], %x[c4_rem], #1\n"
+      "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+      "ldr qW23, [wptr1, w_col_stride2]\n"
+      "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+      "ldr qW33, [wptr2, w_col_stride2]\n"
+      "ldr qU26, [uptr1, u_col_stride5]\n"
+      "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
+      "ldr qU25, [uptr1, u_col_stride4]\n"
+      "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
+      "ldr qU36, [uptr2, u_col_stride5]\n"
+      "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
+      "ldr qW11, [%x[wptr0]], #0x10\n"
+      "fmul vV14.4s, vU16.4s, vW13.4s\n"
+      "ldr qU24, [uptr1, u_col_stride3]\n"
+      "fmul vV13.4s, vU15.4s, vW13.4s\n"
+      "ldr qW31, [wptr2], #0x10\n"
+      "fmla vV14.4s, vU15.4s, vW12.4s\n"
+      "ldr qW21, [wptr1], #0x10\n"
+      "fmul vV12.4s, vU14.4s, vW13.4s\n"
+      "ldr qU34, [uptr2, u_col_stride3]\n"
+      "fmla vV13.4s, vU14.4s, vW12.4s\n"
+      "ldr qU46, [uptr3, u_col_stride5]\n"
+      "fmla vV14.4s, vU14.4s, vW11.4s\n"
+      "ldr qU45, [uptr3, u_col_stride4]\n"
+      "fmla vV14.4s, vU26.4s, vW23.4s\n"
+      "ldr qU35, [uptr2, u_col_stride4]\n"
+      "fmul vV24.4s, vU26.4s, vW13.4s\n"
+      "ldr qU44, [uptr3, u_col_stride3]\n"
+      "fmla vV13.4s, vU25.4s, vW23.4s\n"
+      "beq 2f\n"  // Single iteration only
+
+      "1:"  // Loop body
+        "fmla vV14.4s, vU25.4s, vW22.4s\n"
+        "prfm pldl1keep, [%x[wptr0]]\n"
+        "fmul vV23.4s, vU25.4s, vW13.4s\n"
+        "prfm pldl1keep, [%x[wptr0], %x[w_col_stride1]]\n"
+        "fmla vV24.4s, vU25.4s, vW12.4s\n"
+        "ldr qU56, [uptr4, u_col_stride5]\n"
+        "fmla vV12.4s, vU24.4s, vW23.4s\n"
+        "prfm pldl1keep, [%x[wptr0],    w_col_stride2 ]\n"
+        "fmla vV13.4s, vU24.4s, vW22.4s\n"
+        "prfm pldl1keep, [   wptr1 ]\n"
+        "fmla vV14.4s, vU24.4s, vW21.4s\n"
+        "prfm pldl1keep, [   wptr1 , %x[w_col_stride1]]\n"
+        "fmul vV22.4s, vU24.4s, vW13.4s\n"
+        "prfm pldl1keep, [   wptr1 ,    w_col_stride2 ]\n"
+        "fmla vV23.4s, vU24.4s, vW12.4s\n"
+        "prfm pldl1keep, [   wptr2 ]\n"
+        "fmla vV24.4s, vU24.4s, vW11.4s\n"
+        "ldr qU55, [uptr4, u_col_stride4]\n"
+        "fmla vV14.4s, vU36.4s, vW33.4s\n"
+        "prfm pldl1keep, [   wptr2 , %x[w_col_stride1]]\n"
+        "fmla vV24.4s, vU36.4s, vW23.4s\n"
+        "prfm pldl1keep, [   wptr2 ,    w_col_stride2 ]\n"
+        "fmul vV34.4s, vU36.4s, vW13.4s\n"
+        "ldr qU54, [uptr4, u_col_stride3]\n"
+        "fmla vV13.4s, vU35.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr2 , %x[u_col_stride1]]\n"
+        "fmla vV14.4s, vU35.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr2 ,    u_col_stride2 ]\n"
+        "fmla vV23.4s, vU35.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr2 ,    u_col_stride3 ]\n"
+        "fmla vV24.4s, vU35.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr2 ,    u_col_stride4 ]\n"
+        "fmul vV33.4s, vU35.4s, vW13.4s\n"
+        "prfm pldl1keep, [   uptr2 ,    u_col_stride5 ]\n"
+        "fmla vV34.4s, vU35.4s, vW12.4s\n"
+        "ldr qU66, [uptr5, u_col_stride5]\n"
+        "fmla vV12.4s, vU34.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr3 ]\n"
+        "fmla vV13.4s, vU34.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr3 , %x[u_col_stride1]]\n"
+        "fmla vV14.4s, vU34.4s, vW31.4s\n"
+        "str qV14, [%x[vptr0], v_col_stride3]\n"
+        "fmla vV22.4s, vU34.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr3 ,    u_col_stride2 ]\n"
+        "fmla vV23.4s, vU34.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr3 ,    u_col_stride3 ]\n"
+        "fmla vV24.4s, vU34.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr3 ,    u_col_stride4 ]\n"
+        "fmul vV32.4s, vU34.4s, vW13.4s\n"
+        "prfm pldl1keep, [   uptr3 ,    u_col_stride5 ]\n"
+        "fmla vV33.4s, vU34.4s, vW12.4s\n"
+        "prfm pldl1keep, [   uptr4 ]\n"
+        "fmla vV34.4s, vU34.4s, vW11.4s\n"
+        "ldr qU65, [uptr5, u_col_stride4]\n"
+        "fmla vV24.4s, vU46.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr4 , %x[u_col_stride1]]\n"
+        "fmla vV34.4s, vU46.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr4 ,    u_col_stride2 ]\n"
+        "fmul vV44.4s, vU46.4s, vW13.4s\n"
+        "ldr qU64, [uptr5, u_col_stride3]\n"
+        "fmla vV23.4s, vU45.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr4 ,    u_col_stride3 ]\n"
+        "fmla vV24.4s, vU45.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr4 ,    u_col_stride4 ]\n"
+        "fmla vV33.4s, vU45.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr4 ,    u_col_stride5 ]\n"
+        "fmla vV34.4s, vU45.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr5 ]\n"
+        "fmul vV43.4s, vU45.4s, vW13.4s\n"
+        "prfm pldl1keep, [   uptr5 , %x[u_col_stride1]]\n"
+        "fmla vV44.4s, vU45.4s, vW12.4s\n"
+        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV22.4s, vU44.4s, vW33.4s\n"
+        "prfm pldl1keep, [   uptr5 ,    u_col_stride2 ]\n"
+        "fmla vV23.4s, vU44.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr5 ,    u_col_stride3 ]\n"
+        "fmla vV24.4s, vU44.4s, vW31.4s\n"
+        "str qV24, [vptr1, v_col_stride3]\n"
+        "fmla vV32.4s, vU44.4s, vW23.4s\n"
+        "prfm pldl1keep, [   uptr5 ,    u_col_stride4 ]\n"
+        "fmla vV33.4s, vU44.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr5 ,    u_col_stride5 ]\n"
+        "fmla vV34.4s, vU44.4s, vW21.4s\n"
+        "prfm pstl1keep, [%x[vptr0]]\n"
+        "fmul vV42.4s, vU44.4s, vW13.4s\n"
+        "prfm pstl1keep, [%x[vptr0], %x[v_col_stride1]]\n"
+        "fmla vV43.4s, vU44.4s, vW12.4s\n"
+        "prfm pstl1keep, [%x[vptr0],    v_col_stride2 ]\n"
+        "fmla vV44.4s, vU44.4s, vW11.4s\n"
+        "ldr qU23, [uptr1, u_col_stride2]\n"
+        "fmla vV34.4s, vU56.4s, vW33.4s\n"
+        "prfm pstl1keep, [%x[vptr0],    v_col_stride3 ]\n"
+        "fmla vV44.4s, vU56.4s, vW23.4s\n"
+        "ldr qU33, [uptr2, u_col_stride2]\n"
+        "fmla vV33.4s, vU55.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr1 ]\n"
+        "fmla vV34.4s, vU55.4s, vW32.4s\n"
+        "prfm pstl1keep, [   vptr1 , %x[v_col_stride1]]\n"
+        "fmla vV43.4s, vU55.4s, vW23.4s\n"
+        "prfm pstl1keep, [   vptr1 ,    v_col_stride2 ]\n"
+        "fmla vV44.4s, vU55.4s, vW22.4s\n"
+        "ldr qU43, [uptr3, u_col_stride2]\n"
+        "fmla vV32.4s, vU54.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr1 ,    v_col_stride3 ]\n"
+        "fmla vV33.4s, vU54.4s, vW32.4s\n"
+        "prfm pstl1keep, [   vptr2 ]\n"
+        "fmla vV34.4s, vU54.4s, vW31.4s\n"
+        "str qV34, [vptr2, v_col_stride3]\n"
+        "fmla vV42.4s, vU54.4s, vW23.4s\n"
+        "prfm pstl1keep, [   vptr2 , %x[v_col_stride1]]\n"
+        "fmla vV43.4s, vU54.4s, vW22.4s\n"
+        "prfm pstl1keep, [   vptr2 ,    v_col_stride2 ]\n"
+        "fmla vV44.4s, vU54.4s, vW21.4s\n"
+        "ldr qU53, [uptr4, u_col_stride2]\n"
+        "fmla vV44.4s, vU66.4s, vW33.4s\n"
+        "ldr qU63, [uptr5, u_col_stride2]\n"
+        "fmla vV43.4s, vU65.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr2 ,    v_col_stride3 ]\n"
+        "fmla vV44.4s, vU65.4s, vW32.4s\n"
+        "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
+        "fmla vV42.4s, vU64.4s, vW33.4s\n"
+        "prfm pstl1keep, [   vptr3 ]\n"
+        "fmla vV43.4s, vU64.4s, vW32.4s\n"
+        "prfm pstl1keep, [   vptr3 , %x[v_col_stride1]]\n"
+        "fmla vV44.4s, vU64.4s, vW31.4s\n"
+        "str qV44, [vptr3, v_col_stride3]\n"
+        "fmul vV11.4s, vU13.4s, vW13.4s\n"
+        "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
+        "fmla vV12.4s, vU13.4s, vW12.4s\n"
+        "prfm pstl1keep, [   vptr3 ,    v_col_stride2 ]\n"
+        "fmla vV13.4s, vU13.4s, vW11.4s\n"
+        "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
+        "fmla vV11.4s, vU23.4s, vW23.4s\n"
+        "prfm pstl1keep, [   vptr3 ,    v_col_stride3 ]\n"
+        "fmla vV12.4s, vU23.4s, vW22.4s\n"
+        "fmla vV13.4s, vU23.4s, vW21.4s\n"
+        "fmul vV21.4s, vU23.4s, vW13.4s\n"
+        "fmla vV22.4s, vU23.4s, vW12.4s\n"
+        "fmla vV23.4s, vU23.4s, vW11.4s\n"
+        "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
+        "fmla vV11.4s, vU33.4s, vW33.4s\n"
+        "fmla vV12.4s, vU33.4s, vW32.4s\n"
+        "fmla vV13.4s, vU33.4s, vW31.4s\n"
+        "str qV13, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21.4s, vU33.4s, vW23.4s\n"
+        "fmla vV22.4s, vU33.4s, vW22.4s\n"
+        "fmla vV23.4s, vU33.4s, vW21.4s\n"
+        "fmul vV31.4s, vU33.4s, vW13.4s\n"
+        "fmla vV32.4s, vU33.4s, vW12.4s\n"
+        "fmla vV33.4s, vU33.4s, vW11.4s\n"
+        "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
+        "fmla vV21.4s, vU43.4s, vW33.4s\n"
+        "fmla vV22.4s, vU43.4s, vW32.4s\n"
+        "fmla vV23.4s, vU43.4s, vW31.4s\n"
+        "str qV23, [vptr1, v_col_stride2]\n"
+        "fmla vV31.4s, vU43.4s, vW23.4s\n"
+        "fmla vV32.4s, vU43.4s, vW22.4s\n"
+        "fmla vV33.4s, vU43.4s, vW21.4s\n"
+        "fmul vV41.4s, vU43.4s, vW13.4s\n"
+        "ldr qW13, [%x[wptr0], w_col_stride2]\n"
+        "fmla vV42.4s, vU43.4s, vW12.4s\n"
+        "fmla vV43.4s, vU43.4s, vW11.4s\n"
+        "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
+        "fmla vV31.4s, vU53.4s, vW33.4s\n"
+        "fmla vV32.4s, vU53.4s, vW32.4s\n"
+        "fmla vV33.4s, vU53.4s, vW31.4s\n"
+        "str qV33, [vptr2, v_col_stride2]\n"
+        "fmla vV41.4s, vU53.4s, vW23.4s\n"
+        "ldr qW23, [wptr1, w_col_stride2]\n"
+        "fmla vV42.4s, vU53.4s, vW22.4s\n"
+        "fmla vV43.4s, vU53.4s, vW21.4s\n"
+        "ldr qU11, [%x[uptr0]], #0x10\n"
+        "fmla vV41.4s, vU63.4s, vW33.4s\n"
+        "ldr qW33, [wptr2, w_col_stride2]\n"
+        "fmla vV42.4s, vU63.4s, vW32.4s\n"
+        "prfm pldl1keep, [%x[uptr0]]\n"
+        "fmla vV43.4s, vU63.4s, vW31.4s\n"
+        "str qV43, [vptr3, v_col_stride2]\n"
+        "fmla vV11.4s, vU12.4s, vW12.4s\n"
+        "ldr qU21, [uptr1], #0x10\n"
+        "fmla vV12.4s, vU12.4s, vW11.4s\n"
+        "ldr qU31, [uptr2], #0x10\n"
+        "fmla vV11.4s, vU22.4s, vW22.4s\n"
+        "prfm pldl1keep, [%x[uptr0], %x[u_col_stride1]]\n"
+        "fmla vV12.4s, vU22.4s, vW21.4s\n"
+        "prfm pldl1keep, [%x[uptr0],    u_col_stride2 ]\n"
+        "fmla vV21.4s, vU22.4s, vW12.4s\n"
+        "prfm pldl1keep, [%x[uptr0],    u_col_stride3 ]\n"
+        "fmla vV22.4s, vU22.4s, vW11.4s\n"
+        "ldr qU41, [uptr3], #0x10\n"
+        "fmla vV11.4s, vU32.4s, vW32.4s\n"
+        "prfm pldl1keep, [%x[uptr0],    u_col_stride4 ]\n"
+        "fmla vV12.4s, vU32.4s, vW31.4s\n"
+        "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
+        "fmla vV21.4s, vU32.4s, vW22.4s\n"
+        "prfm pldl1keep, [%x[uptr0],    u_col_stride5 ]\n"
+        "fmla vV22.4s, vU32.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr1 ]\n"
+        "fmla vV31.4s, vU32.4s, vW12.4s\n"
+        "prfm pldl1keep, [   uptr1 , %x[u_col_stride1]]\n"
+        "fmla vV32.4s, vU32.4s, vW11.4s\n"
+        "ldr qU51, [uptr4], #0x10\n"
+        "fmla vV21.4s, vU42.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr1 ,    u_col_stride2 ]\n"
+        "fmla vV22.4s, vU42.4s, vW31.4s\n"
+        "str qV22, [vptr1, %x[v_col_stride1]]\n"
+        "fmla vV31.4s, vU42.4s, vW22.4s\n"
+        "prfm pldl1keep, [   uptr1 ,    u_col_stride3 ]\n"
+        "fmla vV32.4s, vU42.4s, vW21.4s\n"
+        "subs %x[c4_rem], %x[c4_rem], #1\n"
+        "fmla vV41.4s, vU42.4s, vW12.4s\n"
+        "ldr qW12, [%x[wptr0], %x[w_col_stride1]]\n"
+        "fmla vV42.4s, vU42.4s, vW11.4s\n"
+        "ldr qU61, [uptr5], #0x10\n"
+        "fmla vV31.4s, vU52.4s, vW32.4s\n"
+        "prfm pldl1keep, [   uptr1 ,    u_col_stride4 ]\n"
+        "fmla vV32.4s, vU52.4s, vW31.4s\n"
+        "str qV32, [vptr2, %x[v_col_stride1]]\n"
+        "fmla vV41.4s, vU52.4s, vW22.4s\n"
+        "ldr qW22, [wptr1, %x[w_col_stride1]]\n"
+        "fmla vV42.4s, vU52.4s, vW21.4s\n"
+        "ldr qU16, [%x[uptr0], u_col_stride5]\n"
+        "fmla vV41.4s, vU62.4s, vW32.4s\n"
+        "ldr qW32, [wptr2, %x[w_col_stride1]]\n"
+        "fmla vV42.4s, vU62.4s, vW31.4s\n"
+        "str qV42, [vptr3, %x[v_col_stride1]]\n"
+        "fmla vV11.4s, vU11.4s, vW11.4s\n"
+        "ldr qU15, [%x[uptr0], u_col_stride4]\n"
+        "fmla vV11.4s, vU21.4s, vW21.4s\n"
+        "ldr qU14, [%x[uptr0], u_col_stride3]\n"
+        "fmla vV21.4s, vU21.4s, vW11.4s\n"
+        "ldr qU26, [uptr1, u_col_stride5]\n"
+        "fmla vV11.4s, vU31.4s, vW31.4s\n"
+        "str qV11, [%x[vptr0]], #0x10\n"
+        "fmla vV21.4s, vU31.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr1 ,    u_col_stride5 ]\n"
+        "fmla vV31.4s, vU31.4s, vW11.4s\n"
+        "ldr qU25, [uptr1, u_col_stride4]\n"
+        "fmla vV21.4s, vU41.4s, vW31.4s\n"
+        "str qV21, [vptr1], #0x10\n"
+        "fmla vV31.4s, vU41.4s, vW21.4s\n"
+        "prfm pldl1keep, [   uptr2 ]\n"
+        "fmla vV41.4s, vU41.4s, vW11.4s\n"
+        "ldr qW11, [%x[wptr0]], #0x10\n"
+        "fmla vV31.4s, vU51.4s, vW31.4s\n"
+        "str qV31, [vptr2], #0x10\n"
+        "fmla vV41.4s, vU51.4s, vW21.4s\n"
+        "ldr qU36, [uptr2, u_col_stride5]\n"
+        "fmla vV41.4s, vU61.4s, vW31.4s\n"
+        "str qV41, [vptr3], #0x10\n"
+        "fmul vV14.4s, vU16.4s, vW13.4s\n"
+        "ldr qU24, [uptr1, u_col_stride3]\n"
+        "fmul vV13.4s, vU15.4s, vW13.4s\n"
+        "ldr qW31, [wptr2], #0x10\n"
+        "fmla vV14.4s, vU15.4s, vW12.4s\n"
+        "ldr qW21, [wptr1], #0x10\n"
+        "fmul vV12.4s, vU14.4s, vW13.4s\n"
+        "ldr qU34, [uptr2, u_col_stride3]\n"
+        "fmla vV13.4s, vU14.4s, vW12.4s\n"
+        "ldr qU46, [uptr3, u_col_stride5]\n"
+        "fmla vV14.4s, vU14.4s, vW11.4s\n"
+        "ldr qU45, [uptr3, u_col_stride4]\n"
+        "fmla vV14.4s, vU26.4s, vW23.4s\n"
+        "ldr qU35, [uptr2, u_col_stride4]\n"
+        "fmul vV24.4s, vU26.4s, vW13.4s\n"
+        "ldr qU44, [uptr3, u_col_stride3]\n"
+        "fmla vV13.4s, vU25.4s, vW23.4s\n"
+        "bne 1b\n"
+
+      "2:"  // Final iteration
+        "fmla vV14.4s, vU25.4s, vW22.4s\n"
+        "fmul vV23.4s, vU25.4s, vW13.4s\n"
+        "fmla vV24.4s, vU25.4s, vW12.4s\n"
+        "ldr qU56, [uptr4, u_col_stride5]\n"
+        "fmla vV12.4s, vU24.4s, vW23.4s\n"
+        "fmla vV13.4s, vU24.4s, vW22.4s\n"
+        "fmla vV14.4s, vU24.4s, vW21.4s\n"
+        "fmul vV22.4s, vU24.4s, vW13.4s\n"
+        "fmla vV23.4s, vU24.4s, vW12.4s\n"
+        "fmla vV24.4s, vU24.4s, vW11.4s\n"
+        "ldr qU55, [uptr4, u_col_stride4]\n"
+        "fmla vV14.4s, vU36.4s, vW33.4s\n"
+        "fmla vV24.4s, vU36.4s, vW23.4s\n"
+        "fmul vV34.4s, vU36.4s, vW13.4s\n"
+        "ldr qU54, [uptr4, u_col_stride3]\n"
+        "fmla vV13.4s, vU35.4s, vW33.4s\n"
+        "fmla vV14.4s, vU35.4s, vW32.4s\n"
+        "fmla vV23.4s, vU35.4s, vW23.4s\n"
+        "fmla vV24.4s, vU35.4s, vW22.4s\n"
+        "fmul vV33.4s, vU35.4s, vW13.4s\n"
+        "fmla vV34.4s, vU35.4s, vW12.4s\n"
+        "ldr qU66, [uptr5, u_col_stride5]\n"
+        "fmla vV12.4s, vU34.4s, vW33.4s\n"
+        "fmla vV13.4s, vU34.4s, vW32.4s\n"
+        "fmla vV14.4s, vU34.4s, vW31.4s\n"
+        "str qV14, [%x[vptr0], v_col_stride3]\n"
+        "fmla vV22.4s, vU34.4s, vW23.4s\n"
+        "fmla vV23.4s, vU34.4s, vW22.4s\n"
+        "fmla vV24.4s, vU34.4s, vW21.4s\n"
+        "fmul vV32.4s, vU34.4s, vW13.4s\n"
+        "fmla vV33.4s, vU34.4s, vW12.4s\n"
+        "fmla vV34.4s, vU34.4s, vW11.4s\n"
+        "ldr qU65, [uptr5, u_col_stride4]\n"
+        "fmla vV24.4s, vU46.4s, vW33.4s\n"
+        "fmla vV34.4s, vU46.4s, vW23.4s\n"
+        "fmul vV44.4s, vU46.4s, vW13.4s\n"
+        "ldr qU64, [uptr5, u_col_stride3]\n"
+        "fmla vV23.4s, vU45.4s, vW33.4s\n"
+        "fmla vV24.4s, vU45.4s, vW32.4s\n"
+        "fmla vV33.4s, vU45.4s, vW23.4s\n"
+        "fmla vV34.4s, vU45.4s, vW22.4s\n"
+        "fmul vV43.4s, vU45.4s, vW13.4s\n"
+        "fmla vV44.4s, vU45.4s, vW12.4s\n"
+        "ldr qU13, [%x[uptr0], u_col_stride2]\n"
+        "fmla vV22.4s, vU44.4s, vW33.4s\n"
+        "fmla vV23.4s, vU44.4s, vW32.4s\n"
+        "fmla vV24.4s, vU44.4s, vW31.4s\n"
+        "str qV24, [vptr1, v_col_stride3]\n"
+        "fmla vV32.4s, vU44.4s, vW23.4s\n"
+        "fmla vV33.4s, vU44.4s, vW22.4s\n"
+        "fmla vV34.4s, vU44.4s, vW21.4s\n"
+        "fmul vV42.4s, vU44.4s, vW13.4s\n"
+        "fmla vV43.4s, vU44.4s, vW12.4s\n"
+        "fmla vV44.4s, vU44.4s, vW11.4s\n"
+        "ldr qU23, [uptr1, u_col_stride2]\n"
+        "fmla vV34.4s, vU56.4s, vW33.4s\n"
+        "fmla vV44.4s, vU56.4s, vW23.4s\n"
+        "ldr qU33, [uptr2, u_col_stride2]\n"
+        "fmla vV33.4s, vU55.4s, vW33.4s\n"
+        "fmla vV34.4s, vU55.4s, vW32.4s\n"
+        "fmla vV43.4s, vU55.4s, vW23.4s\n"
+        "fmla vV44.4s, vU55.4s, vW22.4s\n"
+        "ldr qU43, [uptr3, u_col_stride2]\n"
+        "fmla vV32.4s, vU54.4s, vW33.4s\n"
+        "fmla vV33.4s, vU54.4s, vW32.4s\n"
+        "fmla vV34.4s, vU54.4s, vW31.4s\n"
+        "str qV34, [vptr2, v_col_stride3]\n"
+        "fmla vV42.4s, vU54.4s, vW23.4s\n"
+        "fmla vV43.4s, vU54.4s, vW22.4s\n"
+        "fmla vV44.4s, vU54.4s, vW21.4s\n"
+        "ldr qU53, [uptr4, u_col_stride2]\n"
+        "fmla vV44.4s, vU66.4s, vW33.4s\n"
+        "ldr qU63, [uptr5, u_col_stride2]\n"
+        "fmla vV43.4s, vU65.4s, vW33.4s\n"
+        "fmla vV44.4s, vU65.4s, vW32.4s\n"
+        "ldr qU12, [%x[uptr0], %x[u_col_stride1]]\n"
+        "fmla vV42.4s, vU64.4s, vW33.4s\n"
+        "fmla vV43.4s, vU64.4s, vW32.4s\n"
+        "fmla vV44.4s, vU64.4s, vW31.4s\n"
+        "str qV44, [vptr3, v_col_stride3]\n"
+        "fmul vV11.4s, vU13.4s, vW13.4s\n"
+        "ldr qU22, [uptr1, %x[u_col_stride1]]\n"
+        "fmla vV12.4s, vU13.4s, vW12.4s\n"
+        "fmla vV13.4s, vU13.4s, vW11.4s\n"
+        "ldr qU32, [uptr2, %x[u_col_stride1]]\n"
+        "fmla vV11.4s, vU23.4s, vW23.4s\n"
+        "fmla vV12.4s, vU23.4s, vW22.4s\n"
+        "fmla vV13.4s, vU23.4s, vW21.4s\n"
+        "fmul vV21.4s, vU23.4s, vW13.4s\n"
+        "fmla vV22.4s, vU23.4s, vW12.4s\n"
+        "fmla vV23.4s, vU23.4s, vW11.4s\n"
+        "ldr qU42, [uptr3, %x[u_col_stride1]]\n"
+        "fmla vV11.4s, vU33.4s, vW33.4s\n"
+        "fmla vV12.4s, vU33.4s, vW32.4s\n"
+        "fmla vV13.4s, vU33.4s, vW31.4s\n"
+        "str qV13, [%x[vptr0], v_col_stride2]\n"
+        "fmla vV21.4s, vU33.4s, vW23.4s\n"
+        "fmla vV22.4s, vU33.4s, vW22.4s\n"
+        "fmla vV23.4s, vU33.4s, vW21.4s\n"
+        "fmul vV31.4s, vU33.4s, vW13.4s\n"
+        "fmla vV32.4s, vU33.4s, vW12.4s\n"
+        "fmla vV33.4s, vU33.4s, vW11.4s\n"
+        "ldr qU52, [uptr4, %x[u_col_stride1]]\n"
+        "fmla vV21.4s, vU43.4s, vW33.4s\n"
+        "fmla vV22.4s, vU43.4s, vW32.4s\n"
+        "fmla vV23.4s, vU43.4s, vW31.4s\n"
+        "str qV23, [vptr1, v_col_stride2]\n"
+        "fmla vV31.4s, vU43.4s, vW23.4s\n"
+        "fmla vV32.4s, vU43.4s, vW22.4s\n"
+        "fmla vV33.4s, vU43.4s, vW21.4s\n"
+        "fmul vV41.4s, vU43.4s, vW13.4s\n"
+        "fmla vV42.4s, vU43.4s, vW12.4s\n"
+        "fmla vV43.4s, vU43.4s, vW11.4s\n"
+        "ldr qU62, [uptr5, %x[u_col_stride1]]\n"
+        "fmla vV31.4s, vU53.4s, vW33.4s\n"
+        "fmla vV32.4s, vU53.4s, vW32.4s\n"
+        "fmla vV33.4s, vU53.4s, vW31.4s\n"
+        "str qV33, [vptr2, v_col_stride2]\n"
+        "fmla vV41.4s, vU53.4s, vW23.4s\n"
+        "fmla vV42.4s, vU53.4s, vW22.4s\n"
+        "fmla vV43.4s, vU53.4s, vW21.4s\n"
+        "ldr qU11, [%x[uptr0]], #0x10\n"
+        "fmla vV41.4s, vU63.4s, vW33.4s\n"
+        "fmla vV42.4s, vU63.4s, vW32.4s\n"
+        "fmla vV43.4s, vU63.4s, vW31.4s\n"
+        "str qV43, [vptr3, v_col_stride2]\n"
+        "fmla vV11.4s, vU12.4s, vW12.4s\n"
+        "ldr qU21, [uptr1], #0x10\n"
+        "fmla vV12.4s, vU12.4s, vW11.4s\n"
+        "ldr qU31, [uptr2], #0x10\n"
+        "fmla vV11.4s, vU22.4s, vW22.4s\n"
+        "fmla vV12.4s, vU22.4s, vW21.4s\n"
+        "fmla vV21.4s, vU22.4s, vW12.4s\n"
+        "fmla vV22.4s, vU22.4s, vW11.4s\n"
+        "ldr qU41, [uptr3], #0x10\n"
+        "fmla vV11.4s, vU32.4s, vW32.4s\n"
+        "fmla vV12.4s, vU32.4s, vW31.4s\n"
+        "str qV12, [%x[vptr0], %x[v_col_stride1]]\n"
+        "fmla vV21.4s, vU32.4s, vW22.4s\n"
+        "fmla vV22.4s, vU32.4s, vW21.4s\n"
+        "fmla vV31.4s, vU32.4s, vW12.4s\n"
+        "fmla vV32.4s, vU32.4s, vW11.4s\n"
+        "ldr qU51, [uptr4], #0x10\n"
+        "fmla vV21.4s, vU42.4s, vW32.4s\n"
+        "fmla vV22.4s, vU42.4s, vW31.4s\n"
+        "str qV22, [vptr1, %x[v_col_stride1]]\n"
+        "fmla vV31.4s, vU42.4s, vW22.4s\n"
+        "fmla vV32.4s, vU42.4s, vW21.4s\n"
+        "subs %x[c4_rem], %x[c4_rem], #1\n"
+        "fmla vV41.4s, vU42.4s, vW12.4s\n"
+        "fmla vV42.4s, vU42.4s, vW11.4s\n"
+        "ldr qU61, [uptr5], #0x10\n"
+        "fmla vV31.4s, vU52.4s, vW32.4s\n"
+        "fmla vV32.4s, vU52.4s, vW31.4s\n"
+        "str qV32, [vptr2, %x[v_col_stride1]]\n"
+        "fmla vV41.4s, vU52.4s, vW22.4s\n"
+        "fmla vV42.4s, vU52.4s, vW21.4s\n"
+        "fmla vV41.4s, vU62.4s, vW32.4s\n"
+        "fmla vV42.4s, vU62.4s, vW31.4s\n"
+        "str qV42, [vptr3, %x[v_col_stride1]]\n"
+        "fmla vV11.4s, vU11.4s, vW11.4s\n"
+        "fmla vV11.4s, vU21.4s, vW21.4s\n"
+        "fmla vV21.4s, vU21.4s, vW11.4s\n"
+        "fmla vV11.4s, vU31.4s, vW31.4s\n"
+        "str qV11, [%x[vptr0]], #0x10\n"
+        "fmla vV21.4s, vU31.4s, vW21.4s\n"
+        "fmla vV31.4s, vU31.4s, vW11.4s\n"
+        "fmla vV21.4s, vU41.4s, vW31.4s\n"
+        "str qV21, [vptr1], #0x10\n"
+        "fmla vV31.4s, vU41.4s, vW21.4s\n"
+        "fmla vV41.4s, vU41.4s, vW11.4s\n"
+        "fmla vV31.4s, vU51.4s, vW31.4s\n"
+        "str qV31, [vptr2], #0x10\n"
+        "fmla vV41.4s, vU51.4s, vW21.4s\n"
+        "fmla vV41.4s, vU61.4s, vW31.4s\n"
+        "str qV41, [vptr3], #0x10\n"
+
+      ".unreq qW22\n" ".unreq qU64\n" ".unreq qU35\n" ".unreq qV41\n"
+      ".unreq qU34\n" ".unreq qU21\n" ".unreq qV43\n" ".unreq qW21\n"
+      ".unreq qU24\n" ".unreq qU54\n" ".unreq qV31\n" ".unreq qV12\n"
+      ".unreq qU61\n" ".unreq qU26\n" ".unreq qV32\n"
+      ".unreq qU36\n" ".unreq qU51\n" ".unreq qU66\n" ".unreq qU12\n"
+      ".unreq qV14\n" ".unreq qV11\n" ".unreq qU65\n"
+      ".unreq qU15\n" ".unreq qU22\n" ".unreq qU45\n"
+      ".unreq qV22\n" ".unreq qU14\n"
+      ".unreq qU44\n" ".unreq qU43\n" ".unreq qU11\n"
+      ".unreq qV24\n" ".unreq qV42\n" ".unreq qW31\n" ".unreq qW13\n"
+      ".unreq qU33\n" ".unreq qU62\n" ".unreq qU25\n" ".unreq qU56\n"
+      ".unreq qW33\n"
+      ".unreq qU42\n" ".unreq qU16\n" ".unreq qV44\n"
+      ".unreq qU63\n" ".unreq qU31\n" ".unreq qV34\n"
+      ".unreq qW11\n" ".unreq qU41\n" ".unreq qV13\n" ".unreq qV33\n"
+      ".unreq qU46\n" ".unreq qU32\n" ".unreq qU13\n"
+      ".unreq qW23\n" ".unreq qV23\n" ".unreq qV21\n" ".unreq qU55\n"
+      ".unreq qW12\n" ".unreq qW32\n" ".unreq qU23\n" ".unreq qU52\n"
+      ".unreq qU53\n" ".unreq vW22\n"
+      ".unreq vU64\n" ".unreq vU35\n" ".unreq vV41\n"
+      ".unreq vU34\n" ".unreq vU21\n" ".unreq vV43\n" ".unreq vW21\n"
+      ".unreq vU24\n" ".unreq vU54\n" ".unreq vV31\n"
+      ".unreq vV12\n" ".unreq vU61\n"
+      ".unreq vU26\n" ".unreq vV32\n"
+      ".unreq vU36\n" ".unreq vU51\n" ".unreq vU66\n" ".unreq vU12\n"
+      ".unreq vV14\n" ".unreq vV11\n" ".unreq vU65\n"
+      ".unreq vU15\n" ".unreq vU22\n" ".unreq vU45\n"
+      ".unreq vV22\n" ".unreq vU14\n"
+      ".unreq vU44\n" ".unreq vU43\n" ".unreq vU11\n"
+      ".unreq vV24\n" ".unreq vV42\n" ".unreq vW31\n" ".unreq vW13\n"
+      ".unreq vU33\n" ".unreq vU62\n" ".unreq vU25\n" ".unreq vU56\n"
+      ".unreq vW33\n" ".unreq vU42\n" ".unreq vU16\n" ".unreq vV44\n"
+      ".unreq vU63\n" ".unreq vU31\n" ".unreq vV34\n" ".unreq vW11\n"
+      ".unreq vU41\n" ".unreq vV13\n" ".unreq vV33\n"
+      ".unreq vU46\n" ".unreq vU32\n" ".unreq vU13\n" ".unreq vW23\n"
+      ".unreq vV23\n" ".unreq vV21\n" ".unreq vU55\n" ".unreq vW12\n"
+      ".unreq vW32\n" ".unreq vU23\n" ".unreq vU52\n" ".unreq vU53\n"
+      : [uptr0] "+r" (uptr0), [vptr0] "+r" (vptr0), [wptr0] "+r" (wptr0),
+        [c4_rem] "+r" (c4_rem)
+      : [u_row_stride] "r" (in_row_stride * sizeof(float)),
+        [u_col_stride1] "r" (in_col_stride * sizeof(float)),
+        [v_row_stride] "r" (out_row_stride * sizeof(float)),
+        [v_col_stride1] "r" (out_col_stride * sizeof(float)),
+        [w_row_stride] "r" (weight_row_stride * sizeof(float)),
+        [w_col_stride1] "r" (weight_col_stride * sizeof(float))
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x0",
+        "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+        "x12", "x13", "x14", "x15", "x16", "cc", "memory"
+    );
+  }
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load input tile
+    float u[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      const float* const inptr_row = uptr0 + (i - in_pad_top)*in_row_stride;
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        if (i < in_pad_top || in_cells_i <= i ||
+            j < in_pad_left || in_cells_j <= j)
+        {
+          u[i][j] = static_cast<float>(0);
+        }
+        else
+        {
+          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+        }
+      }
+    }
+    uptr0++;
+
+    // Load weights tile
+    float w[kernel_rows][kernel_cols];
+    for (int i = 0; i < kernel_rows; i++)
+    {
+      const float* const wptr_row = wptr0 + i*weight_row_stride;
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[i][j] = *(wptr_row + j*weight_col_stride);
+      }
+    }
+    wptr0++;
+
+    // Perform the convolution
+    float v[output_tile_rows][output_tile_cols];
+    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    {
+      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      {
+        // Clear the accumulator
+        v[out_i][out_j] = static_cast<float>(0);
+
+        // Base co-ordinate
+        const int base_i = out_i * stride_rows;
+        const int base_j = out_j * stride_cols;
+
+        // Fill the accumulator
+        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        {
+          const int i = base_i + in_i;
+          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          {
+            const int j = base_j + in_j;
+            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+          }
+        }
+      }
+    }
+
+    // Store the output tile
+    for (int i = 0; i < out_cells_i; i++)
+    {
+      float* const outptr_row = vptr0 + i*out_row_stride;
+      for (int j = 0; j < out_cells_j; j++)
+      {
+        *(outptr_row + j*out_col_stride) = v[i][j];
+      }
+    }
+    vptr0++;
+  }
+}
+
+#endif  // __aarch64__
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
 };
 
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
 
 template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 2104c0b..8eb53a6 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -29,5179 +29,138 @@
 using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float, float>;
 
 template <>
-const Conv::TileFn Conv::tile_fns
-  [max_in_pad_top]
-  [max_in_pad_left]
-  [max_in_pad_bottom]
-  [max_in_pad_right]
-  [max_out_pad_bottom]
-  [max_out_pad_right] = {
-  {  // Input pad top = 0
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 0, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 0, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 0, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 0, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 0, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 0, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 0, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 0, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 0, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 0, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 1, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 1, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 1, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 1, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 1, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 1, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 1, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 1, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 1, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 1, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 2, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 2, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 2, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 2, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 2, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 2, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 2, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 2, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 2, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 2, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 3, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 3, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 3, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 3, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 3, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 3, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 3, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 3, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 3, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 3, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 4, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 4, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 4, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 4, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 4, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 4, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 4, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 4, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 4, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 4, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 5, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 5, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 5, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 5, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 5, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 5, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 5, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 5, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 5, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 5, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 0, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 0, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 0, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 0, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 0, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 0, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 1, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 1, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 1, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 1, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 1, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 1, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 2, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 2, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 2, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 2, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 2, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 2, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 3, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 3, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 3, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 3, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 3, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 3, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 4, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 4, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 4, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 4, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 4, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 4, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 5, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 5, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 5, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 5, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 5, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 5, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 0, 6, 6, 0, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 0, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 0, 2>,
-            Conv::template process_tile<0, 0, 6, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 0, 6, 6, 1, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 1, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 1, 2>,
-            Conv::template process_tile<0, 0, 6, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 0, 6, 6, 2, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 2, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 2, 2>,
-            Conv::template process_tile<0, 0, 6, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 0, 6, 6, 3, 0>,
-            Conv::template process_tile<0, 0, 6, 6, 3, 1>,
-            Conv::template process_tile<0, 0, 6, 6, 3, 2>,
-            Conv::template process_tile<0, 0, 6, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 0, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 0, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 0, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 0, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 0, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 0, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 0, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 0, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 0, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 0, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 1, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 1, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 1, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 1, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 1, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 1, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 1, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 1, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 1, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 1, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 2, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 2, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 2, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 2, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 2, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 2, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 2, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 2, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 2, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 2, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 3, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 3, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 3, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 3, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 3, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 3, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 3, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 3, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 3, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 3, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 4, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 4, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 4, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 4, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 4, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 4, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 4, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 4, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 4, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 4, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 5, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 5, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 5, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 5, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 5, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 5, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 5, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 5, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 5, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 5, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 0, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 0, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 0, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 0, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 0, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 0, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 1, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 1, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 1, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 1, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 1, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 1, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 2, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 2, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 2, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 2, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 2, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 2, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 3, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 3, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 3, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 3, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 3, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 3, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 4, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 4, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 4, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 4, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 4, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 4, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 5, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 5, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 5, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 5, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 5, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 5, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<0, 1, 6, 6, 0, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 0, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 0, 2>,
-            Conv::template process_tile<0, 1, 6, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<0, 1, 6, 6, 1, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 1, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 1, 2>,
-            Conv::template process_tile<0, 1, 6, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<0, 1, 6, 6, 2, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 2, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 2, 2>,
-            Conv::template process_tile<0, 1, 6, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<0, 1, 6, 6, 3, 0>,
-            Conv::template process_tile<0, 1, 6, 6, 3, 1>,
-            Conv::template process_tile<0, 1, 6, 6, 3, 2>,
-            Conv::template process_tile<0, 1, 6, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 1
-  },  // Input pad top = 0
-  {  // Input pad top = 1
-    {  // Input pad left = 0
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 0, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 0, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 0, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 0, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 0, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 0, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 0, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 0, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 0, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 0, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 1, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 1, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 1, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 1, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 1, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 1, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 1, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 1, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 1, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 1, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 2, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 2, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 2, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 2, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 2, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 2, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 2, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 2, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 2, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 2, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 3, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 3, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 3, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 3, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 3, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 3, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 3, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 3, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 3, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 3, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 4, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 4, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 4, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 4, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 4, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 4, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 4, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 4, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 4, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 4, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 5, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 5, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 5, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 5, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 5, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 5, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 5, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 5, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 5, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 5, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 0, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 0, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 0, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 0, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 0, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 0, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 1, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 1, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 1, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 1, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 1, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 1, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 2, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 2, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 2, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 2, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 2, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 2, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 3, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 3, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 3, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 3, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 3, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 3, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 4, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 4, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 4, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 4, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 4, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 4, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 5, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 5, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 5, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 5, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 5, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 5, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 0, 6, 6, 0, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 0, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 0, 2>,
-            Conv::template process_tile<1, 0, 6, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 0, 6, 6, 1, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 1, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 1, 2>,
-            Conv::template process_tile<1, 0, 6, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 0, 6, 6, 2, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 2, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 2, 2>,
-            Conv::template process_tile<1, 0, 6, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 0, 6, 6, 3, 0>,
-            Conv::template process_tile<1, 0, 6, 6, 3, 1>,
-            Conv::template process_tile<1, 0, 6, 6, 3, 2>,
-            Conv::template process_tile<1, 0, 6, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 0
-    {  // Input pad left = 1
-      {  // Input pad bottom = 0
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 0, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 0, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 0, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 0, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 0, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 0, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 0, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 0, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 0, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 0, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 0
-      {  // Input pad bottom = 1
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 1, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 1, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 1, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 1, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 1, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 1, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 1, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 1, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 1, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 1, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 1
-      {  // Input pad bottom = 2
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 2, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 2, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 2, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 2, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 2, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 2, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 2, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 2, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 2, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 2, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 2
-      {  // Input pad bottom = 3
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 3, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 3, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 3, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 3, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 3, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 3, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 3, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 3, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 3, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 3, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 3
-      {  // Input pad bottom = 4
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 4, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 4, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 4, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 4, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 4, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 4, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 4, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 4, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 4, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 4, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 4
-      {  // Input pad bottom = 5
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 5, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 5, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 5, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 5, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 5, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 5, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 5, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 5, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 5, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 5, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 5
-      {  // Input pad bottom = 6
-        {  // Input pad right = 0
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 0, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 0, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 0, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 0, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 0, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 0, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 0, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 0, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 0, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 0, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 0
-        {  // Input pad right = 1
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 1, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 1, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 1, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 1, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 1, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 1, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 1, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 1, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 1, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 1, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 1
-        {  // Input pad right = 2
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 2, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 2, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 2, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 2, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 2, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 2, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 2, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 2, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 2, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 2, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 2
-        {  // Input pad right = 3
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 3, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 3, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 3, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 3, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 3, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 3, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 3, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 3, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 3, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 3, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 3
-        {  // Input pad right = 4
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 4, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 4, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 4, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 4, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 4, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 4, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 4, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 4, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 4, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 4, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 4
-        {  // Input pad right = 5
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 5, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 5, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 5, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 5, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 5, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 5, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 5, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 5, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 5, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 5, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 5
-        {  // Input pad right = 6
-          {  // Output pad bottom = 0
-            Conv::template process_tile<1, 1, 6, 6, 0, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 0, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 0, 2>,
-            Conv::template process_tile<1, 1, 6, 6, 0, 3>,
-          },  // Output pad bottom = 0
-          {  // Output pad bottom = 1
-            Conv::template process_tile<1, 1, 6, 6, 1, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 1, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 1, 2>,
-            Conv::template process_tile<1, 1, 6, 6, 1, 3>,
-          },  // Output pad bottom = 1
-          {  // Output pad bottom = 2
-            Conv::template process_tile<1, 1, 6, 6, 2, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 2, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 2, 2>,
-            Conv::template process_tile<1, 1, 6, 6, 2, 3>,
-          },  // Output pad bottom = 2
-          {  // Output pad bottom = 3
-            Conv::template process_tile<1, 1, 6, 6, 3, 0>,
-            Conv::template process_tile<1, 1, 6, 6, 3, 1>,
-            Conv::template process_tile<1, 1, 6, 6, 3, 2>,
-            Conv::template process_tile<1, 1, 6, 6, 3, 3>,
-          },  // Output pad bottom = 3
-        },  // Input pad right = 6
-      },  // Input pad bottom = 6
-    },  // Input pad left = 1
-  },  // Input pad top = 1
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+  ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
 };
 
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+  ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+  ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+  },
+  {
+    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+    ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+  },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
 
 template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float, float>;
 }  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index a95ce0e..3b3cda0 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -86,148 +86,288 @@
   const float *inptr = matrix_base;
   const float *bptr = biases;
 
-  // For each channel of the output
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
+  if (bptr)
   {
-    // Matrices used and computed during this transform
-    float32x4_t F[4][4], FZ[4][2], f[2][2], b;
-
-    // Read a 4x4 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 4; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 4; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[4][4], FZ[4][2], f[2][2], b;
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Load the bias vector
+      b = vld1q_f32(bptr);
+      bptr += 4;
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr += 4;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 4; i++)
-    {
-      // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-      FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
-
-      // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-      FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-      f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-      // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-      f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-    }
-
-    // Load the bias vector
-    b = vld1q_f32(bptr);
-    bptr += 4;
-
-    // Write out the output tile
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
-      {
-        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-        outptrs[i][j] += 4;
-      }
-    }
-  }
 #endif  // __aarch64__
 #ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used and computed during this transform
-    float32x2_t F[4][4], FZ[4][2], f[2][2], b;
-
-    // Read a 4x4 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 4; i++)
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      for (int j = 0; j < 4; j++, m++)
+      // Matrices used and computed during this transform
+      float32x2_t F[4][4], FZ[4][2], f[2][2], b;
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Load the bias vector
+      b = vld1_f32(bptr);
+      bptr += 2;
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+          outptrs[i][j] += 2;
+        }
       }
     }
-    inptr += 2;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 4; i++)
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
     {
-      // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-      FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+      // Matrices used and computed during this transform
+      float F[4][4], FZ[4][2], f[2][2], b;
 
-      // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-      FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-      f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
-
-      // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-      f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
-    }
-
-    // Load the bias vector
-    b = vld1_f32(bptr);
-    bptr += 2;
-
-    // Write out the output tile
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-        outptrs[i][j] += 2;
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+      }
+
+      // Load the bias
+      b = *(bptr++);
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j] + b;
+        }
       }
     }
   }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
+  else
   {
-    // Matrices used and computed during this transform
-    float F[4][4], FZ[4][2], f[2][2], b;
-
-    // Read a 4x4 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 4; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 4; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[4][4], FZ[4][2], f[2][2];
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        F[i][j] = *(inptr + m*matrix_stride);
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr++;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 4; i++)
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
-      FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
-    }
+      // Matrices used and computed during this transform
+      float32x2_t F[4][4], FZ[4][2], f[2][2];
 
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
-      f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
-    }
-
-    // Load the bias
-    b = *(bptr++);
-
-    // Write out the output tile
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
       {
-        *(outptrs[i][j]++) = f[i][j] + b;
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        // FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+        // FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+        FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+        // f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+        f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 2;
+        }
+      }
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed during this transform
+      float F[4][4], FZ[4][2], f[2][2];
+
+      // Read a 4x4 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 4; i++)
+      {
+        for (int j = 0; j < 4; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 4; i++)
+      {
+        FZ[i][0] =  F[i][0] + F[i][1] + F[i][2];
+        FZ[i][1] =  F[i][1] - F[i][2] - F[i][3];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  FZ[0][j] + FZ[1][j] + FZ[2][j];
+        f[1][j] =  FZ[1][j] - FZ[2][j] - FZ[3][j];
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j];
+        }
       }
     }
   }
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index 6bb1674..8668535 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -35,6 +35,7 @@
 template <>
 int Transform::ops_performed(const Tensor4DShape &shape)
 {
+  (void) shape;
   return 0;
 }
 
@@ -83,142 +84,282 @@
   const float *inptr = matrix_base;
   const float *bptr = biases;
 
-  // For each channel of the output
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
+  if (bptr)
   {
-    // Matrices used and computed during this transform
-    float32x4_t F[6][6], FZ[6][2], f[2][2], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][2], f[2][2], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      b = vld1q_f32(bptr);
+      bptr += 4;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr += 4;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
-    {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-      FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-      f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1q_f32(bptr);
-    bptr += 4;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
-      {
-        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-        outptrs[i][j] += 4;
-      }
-    }
-  }
 #endif  // __aarch64__
 #ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used and computed during this transform
-    float32x2_t F[6][6], FZ[6][2], f[2][2], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][2], f[2][2], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      b = vld1_f32(bptr);
+      bptr += 2;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+          outptrs[i][j] += 2;
+        }
       }
     }
-    inptr += 2;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
     {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][2], f[2][2], b;
 
-      // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-      FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-      f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1_f32(bptr);
-    bptr += 2;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-        outptrs[i][j] += 2;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      b = *(bptr++);
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j] + b;
+        }
       }
     }
   }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
+  else
   {
-    // Matrices used and computed during this transform
-    float F[6][6], FZ[6][2], f[2][2], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][2], f[2][2];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = *(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr++;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
-    }
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][2], f[2][2];
 
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 2; j++)
-    {
-      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
-    }
-
-    // Write out the output tile
-    b = *(bptr++);
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        *(outptrs[i][j]++) = f[i][j] + b;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+        FZ[i][1] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =               1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+        f[1][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 2;
+        }
+      }
+    }
+#endif  // __arm_any__
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][2], f[2][2];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =               1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 2; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =                1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j];
+        }
       }
     }
   }
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index 609823b..cd3bdef 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -100,170 +100,338 @@
   const float *inptr = matrix_base;
   const float *bptr = biases;
 
-  // For each channel of the output
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
+  if (bptr)
   {
-    // Matrices used and computed during this transform
-    float32x4_t F[6][6], FZ[6][4], f[4][4], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][4], f[4][4], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      b = vld1q_f32(bptr);
+      bptr += 4;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr += 4;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
-    {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
-
-      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-      FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
-
-      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-      FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
-
-      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-      FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 4; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-      f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-      f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-      f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1q_f32(bptr);
-    bptr += 4;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
-      {
-        vst1q_f32(outptrs[i][j], vaddq_f32(f[i][j], b));
-        outptrs[i][j] += 4;
-      }
-    }
-  }
 #endif  // __aarch64__
 #ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used and computed during this transform
-    float32x2_t F[6][6], FZ[6][4], f[4][4], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][4], f[4][4], b;
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      b = vld1_f32(bptr);
+      bptr += 2;
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
+          outptrs[i][j] += 2;
+        }
       }
     }
-    inptr += 2;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif
+    for (; channels_remaining; channels_remaining--)
     {
-      // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][4], f[4][4], b;
 
-      // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-      FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
-
-      // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-      FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
-
-      // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-      FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
-    }
-
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 4; j++)
-    {
-      // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
-
-      // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-      f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
-
-      // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-      f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
-
-      // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-      f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
-    }
-
-    // Write out the output tile
-    b = vld1_f32(bptr);
-    bptr += 2;
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        vst1_f32(outptrs[i][j], vadd_f32(f[i][j], b));
-        outptrs[i][j] += 2;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      b = *(bptr++);
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j] + b;
+        }
       }
     }
   }
-#endif
-  for (; channels_remaining; channels_remaining--)
+  else
   {
-    // Matrices used and computed during this transform
-    float F[6][6], FZ[6][4], f[4][4], b;
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = 0, m = 0; i < 6; i++)
+    // For each channel of the output
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    for (; channels_remaining >= 4; channels_remaining -= 4)
     {
-      for (int j = 0; j < 6; j++, m++)
+      // Matrices used and computed during this transform
+      float32x4_t F[6][6], FZ[6][4], f[4][4];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        F[i][j] = *(inptr + m*matrix_stride);
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 4;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1q_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 4;
+        }
       }
     }
-    inptr++;
-
-    // Compute the matrix F Z
-    for (int i = 0; i < 6; i++)
+#endif  // __aarch64__
+#ifdef __arm_any__
+    for (; channels_remaining >= 2; channels_remaining -= 2)
     {
-      FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
-      FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
-      FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
-      FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
-    }
+      // Matrices used and computed during this transform
+      float32x2_t F[6][6], FZ[6][4], f[4][4];
 
-    // Compute the output tile f = ZT F Z
-    for (int j = 0; j < 4; j++)
-    {
-      f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
-      f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
-      f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
-      f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
-    }
-
-    // Write out the output tile
-    b = *(bptr++);
-    for (int i = 0; i < cells_i; i++)
-    {
-      for (int j = 0; j < cells_j; j++)
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
       {
-        *(outptrs[i][j]++) = f[i][j] + b;
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = vld1_f32(inptr + m*matrix_stride);
+        }
+      }
+      inptr += 2;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+        // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+        // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+        // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+        FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+        // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+        // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+        // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+        f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          vst1_f32(outptrs[i][j], f[i][j]);
+          outptrs[i][j] += 2;
+        }
+      }
+    }
+#endif
+    for (; channels_remaining; channels_remaining--)
+    {
+      // Matrices used and computed during this transform
+      float F[6][6], FZ[6][4], f[4][4];
+
+      // Read a 6x6 tile in the Winograd domain
+      for (int i = 0, m = 0; i < 6; i++)
+      {
+        for (int j = 0; j < 6; j++, m++)
+        {
+          F[i][j] = *(inptr + m*matrix_stride);
+        }
+      }
+      inptr++;
+
+      // Compute the matrix F Z
+      for (int i = 0; i < 6; i++)
+      {
+        FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
+        FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
+        FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
+        FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
+      }
+
+      // Compute the output tile f = ZT F Z
+      for (int j = 0; j < 4; j++)
+      {
+        f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
+        f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
+        f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
+        f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
+      }
+
+      // Write out the output tile
+      for (int i = 0; i < cells_i; i++)
+      {
+        for (int j = 0; j < cells_j; j++)
+        {
+          *(outptrs[i][j]++) = f[i][j];
+        }
       }
     }
   }
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
index c082356..a5d4302 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
@@ -21,9 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include <cstring>
 #include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
-
 using namespace winograd;
 
 /** Get the output shape of a convolution. */
@@ -37,8 +37,8 @@
 {
   return Tensor4DShape {
     in_shape.n_batches,
-  (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
-  (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
+    (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - (kernel_rows - 1),
+    (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - (kernel_cols - 1),
     kernel_shape.n_output_channels,
     in_shape.ordering
   };
@@ -221,344 +221,6 @@
 }
 
 
-/** Create a new Winograd operator. */
-template <int output_tile_rows, int output_tile_cols,
-          int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Convolution<TOut, TIn>::Convolution(
-  const KernelShape &kernel_shape,
-  const Tensor4DShape &input_shape,
-  const PaddingType padding,
-  void *kernel_storage
-) : kernel_shape(kernel_shape),  // Store the kernel shape
-    kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)),
-    manage_kernel_storage(kernel_storage == NULL),
-    _kernel_storage(manage_kernel_storage ?
-                      ALLOCATE(get_kernel_storage_size(kernel_shape)) :
-                      kernel_storage),
-    input_shape(input_shape),
-    padding(padding),
-    output_shape(get_output_shape(kernel_shape, input_shape, padding)),
-    tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)),
-    tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)),
-    M(input_shape.n_batches * tile_rows * tile_cols),
-    K(kernel_shape.n_input_channels),
-    N(kernel_shape.n_output_channels),
-    prof()
-{
-  // Create pointers to the kernel matrices
-  const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
-  int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
-  for (int i = 0; i < N_GEMMS; i++) {
-    kernel_matrices[i] = reinterpret_cast<TIn *>(
-      ks_bytes + i*kernel_matrix_size_bytes);
-  }
-}
-
-
-/** Create a new Winograd operator and initialise the weights. */
-template <int output_tile_rows, int output_tile_cols,
-          int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Convolution<TOut, TIn>::Convolution(
-  const KernelShape &kernel_shape,
-  const Tensor4DShape &input_shape,
-  const PaddingType padding,
-  const TIn* const kernel,
-  void *kernel_storage,
-  void *transform_working_space
-) : Convolution(kernel_shape, input_shape, padding, kernel_storage)
-{
-  transform_weights(kernel, transform_working_space);
-}
-
-
-/** Clean up a convolution engine. */
-template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::~Convolution()
-{
-  // If we were responsible for managing kernel storage ensure that it is
-  // freed.
-  if (manage_kernel_storage)
-  {
-    free(_kernel_storage);
-  }
-}
-
-
-/** Transform weights into the Winograd domain and store them for later use/reuse. */
-template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-template <typename WeightsTransformT>
-void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::transform_weights(
-  const TIn* const kernel,
-  void *transform_working_space
-)
-{
-  // Allocate working space if it is required
-  bool allocated_working_space = false;
-  if (transform_working_space == NULL &&  // If no memory has been provided
-      get_kernel_transform_working_size(kernel_shape) != 0)  // And we need the space
-  {
-    allocated_working_space = true;
-    transform_working_space = ALLOCATE(
-      get_kernel_transform_working_size(kernel_shape)
-    );
-  }
-
-  // The transformation methods only work on weights laid out in HWIO form, if
-  // the weights are not in this form then we need to re-order them.
-  const TIn *kernel_hwio = kernel;
-  if (kernel_shape.ordering != HWIO)
-  {
-    kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
-
-    // Re-order the weights from OIHW to HWIO
-    this->prof(
-      "Weight reorder",
-      [&kernel, &kernel_hwio, this] () {
-        reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
-          kernel, const_cast<TIn *>(kernel_hwio),
-          kernel_shape.n_output_channels,
-          kernel_shape.n_input_channels,
-          kernel_shape.n_rows,
-          kernel_shape.n_cols
-        );
-      },
-      kernel_shape.size() * sizeof(TIn),
-      0,
-      kernel_shape.size() * sizeof(TIn)
-    );
-  }
-
-  const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
-  WeightsTransformT weights_transform(
-    kernel_hwio, kernel_matrices[0],
-    kernel_matrix_size_bytes / sizeof(TIn),
-    kernel_matrix_row_stride,
-    kernel_shape.n_output_channels,
-    kernel_shape.n_input_channels
-  );
-
-  // Transform the weights into the Winograd domain
-  auto kernel_prep = [&] ()
-  {
-    weights_transform.run(0, weights_transform.get_window());
-  };
-
-  prof(
-    "Kernel Prep", kernel_prep,
-    WeightsTransformT::bytes_read(kernel_shape),
-    WeightsTransformT::ops_performed(kernel_shape),
-    WeightsTransformT::bytes_written(kernel_shape)
-  );
-
-  // Free memory if we allocated it
-  if (allocated_working_space)
-  {
-    free(transform_working_space);
-  }
-}
-
-
-/** Perform a convolution. */
-template <int output_tile_rows, int output_tile_cols,
-          int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::execute(
-  TOut* const output,
-  const TIn* const input,
-  const TOut* const biases,
-  void *working_space,
-  const int n_threads
-)
-{
-  const auto padding_type = padding;
-  const auto input_shape = this->input_shape;
-
-  // Allocate working space if none has been provided
-  const bool manage_working_space = (working_space == NULL);
-  if (manage_working_space)
-  {
-    const size_t ws_size = get_working_space_size(
-      kernel_shape, input_shape, padding_type
-    );
-    working_space = ALLOCATE(ws_size * sizeof(int8_t));
-    memset(working_space, 0x00, ws_size);
-  }
-  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
-
-  // Split the working space into that required for 16 input matrices and
-  // output matrices.
-  TIn *input_matrices[N_GEMMS];
-  TOut *output_matrices[N_GEMMS];
-  const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type);
-  const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type);
-
-  for (int i = 0; i < N_GEMMS; i++)
-  {
-    input_matrices[i] = reinterpret_cast<TIn *>(
-        ws_bytes + i*in_matrix_stride_bytes);
-    output_matrices[i] = reinterpret_cast<TIn *>(
-        ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
-  }
-
-  // If we need to re-order the input and output tensors then the final chunk
-  // of the working space can be used for this purpose.
-  const TIn* input_nhwc = input;
-  if (input_shape.ordering == NCHW)
-  {
-    input_nhwc = reinterpret_cast<TIn *>(
-      ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
-    );
-
-    this->prof(
-      "NCHW -> NHWC",
-      [input, input_shape, input_nhwc] () {
-        reorder::nchw_to_nhwc(
-          input, const_cast<TIn *>(input_nhwc),
-          input_shape.n_batches,
-          input_shape.n_channels,
-          input_shape.n_rows,
-          input_shape.n_cols
-        );
-      },
-      input_shape.size(), 0, input_shape.size()
-    );
-  }
-
-  // Compute shape for the GEMM
-  const auto output_shape = this->output_shape;
-  int M = this->M;
-  int K = this->K;
-  int N = this->N;
-
-  const int in_matrix_row_stride = K;
-  const int out_matrix_row_stride = kernel_matrix_row_stride;
-
-  InputTransform<TIn> input_transform(
-    input_nhwc,
-    input_shape.n_batches,
-    input_shape.n_rows,
-    input_shape.n_cols,
-    input_shape.n_channels,
-    padding_type,
-    input_matrices[0],
-    in_matrix_stride_bytes / sizeof(TIn),
-    in_matrix_row_stride
-  );
-
-  // Transform the input into the Winograd domain
-  auto input_prep = [&] () {
-    input_transform.run(0, input_transform.get_window());
-  };
-  prof(
-    "Input Prep", input_prep,
-    InputTransform<TIn>::bytes_read(input_shape),
-    InputTransform<TIn>::ops_performed(input_shape),
-    InputTransform<TIn>::bytes_written(input_shape)
-  );
-
-  // Perform the GEMMs
-  const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape);
-  BatchedBlockedGemm<M_BLOCK, N_BLOCK, TOut, TIn> gemms(
-    N_GEMMS, M, K, N,
-    in_matrix_stride_bytes / sizeof(TIn),
-    in_matrix_row_stride,
-    kernel_matrix_stride_bytes / sizeof(TIn),
-    kernel_matrix_row_stride,
-    out_matrix_stride_bytes / sizeof(TOut),
-    out_matrix_row_stride,
-    input_matrices[0],
-    kernel_matrices[0],
-    output_matrices[0]
-  );
-  for (unsigned int i = 0; i < gemms.get_window(); i++)
-  {
-    auto run_gemm = [&] () { gemms.run(i, i+1); };
-    prof("GEMM", run_gemm, 0, 0, 0);
-  }
-
-  // If the output tensor needs to be in NCHW form then store the NHWC output
-  // tensor in temporary storage and then reorder. If the output tensor needs
-  // to be in NHWC then just write straight to the output tensor.
-  TOut *output_nhwc = output;
-  if (input_shape.ordering == NCHW)
-  {
-    output_nhwc = reinterpret_cast<TOut *>(
-      ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
-    );
-  }
-
-  // Transform the output tensor from the Winograd domain to the spatial
-  // domain.
-  OutputTransform<TOut> output_transform(
-    output_matrices[0],
-    out_matrix_stride_bytes / sizeof(TOut),
-    out_matrix_row_stride,
-    biases,
-    output_nhwc,
-    output_shape.n_batches,
-    output_shape.n_rows,
-    output_shape.n_cols,
-    output_shape.n_channels
-  );
-  auto output_prep = [&] () {
-    output_transform.run(0, output_transform.get_window());
-  };
-  prof(
-    "Output Comp", output_prep,
-    OutputTransform<TOut>::bytes_read(output_shape),
-    OutputTransform<TOut>::ops_performed(output_shape),
-    OutputTransform<TOut>::bytes_written(output_shape)
-  );
-
-  // Reorder the output tensor if it is required to be in NCHW form.
-  if (input_shape.ordering == NCHW)
-  {
-    prof(
-      "NHWC -> NCHW",
-      [output_nhwc, output_shape, output] () {
-        reorder::nhwc_to_nchw(
-          output_nhwc, output,
-          output_shape.n_batches,
-          output_shape.n_rows,
-          output_shape.n_cols,
-          output_shape.n_channels
-        );
-      },
-      output_shape.size(), 0, output_shape.size()
-    );
-  }
-
-  // Free working space if we were responsible for allocating it
-  if (manage_working_space)
-  {
-    free(working_space);
-  }
-}
-
-
-/** Perform a convolution. */
-template <int output_tile_rows, int output_tile_cols,
-          int kernel_rows, int kernel_cols>
-template <typename TOut, typename TIn>
-void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
-Convolution<TOut, TIn>::execute(
-  TOut* const output,
-  const TIn* const input,
-  const TOut* const biases,
-  const int n_threads
-)
-{
-  execute(output, input, biases, NULL, n_threads);
-}
-
-
 // Instantiate required implementations
 template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
 template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 836c379..237f133 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -100,6 +100,7 @@
     if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
     {
         ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+        _valid_region = ValidRegion{ _coords, shape };
     }
     else if(_extend_parent) // Extend parent shape, configure if specified
     {
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index bd0c85f..676938a 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -34,7 +34,7 @@
 
 TensorInfo::TensorInfo()
     : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
-      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info()
+      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW)
 {
 }
 
@@ -53,6 +53,7 @@
     _valid_region                  = info.valid_region();
     _padding                       = info.padding();
     _quantization_info             = info.quantization_info();
+    _data_layout                   = info.data_layout();
 }
 
 TensorInfo::TensorInfo(Format format)
@@ -167,13 +168,13 @@
     // Number of cells for each block
     const Size2D num_cells_per_block = hog_info.num_cells_per_block();
 
-    // Tensor Size = (Number of horizontal blocks) * (Number of vertical blocks )
-    const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+    // Tensor Size = (Number of horizontal block positions) * (Number of vertical block positions)
+    const Size2D num_block_positions_per_img = hog_info.num_block_positions_per_image(Size2D(width, height));
 
     // Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
     const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
 
-    init(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+    init(TensorShape(num_block_positions_per_img.width, num_block_positions_per_img.height), num_channels, DataType::F32);
 }
 
 size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
@@ -211,13 +212,13 @@
     // Number of cells for each block
     const Size2D num_cells_per_block = hog_info.num_cells_per_block();
 
-    // Tensor Size = (Number of horizontal blocks) * (Number of vertical blocks )
-    const Size2D num_blocks_per_img = hog_info.num_blocks_per_image(Size2D(width, height));
+    // Tensor Size = (Number of horizontal block positions) * (Number of vertical block positions)
+    const Size2D num_block_positions_per_img = hog_info.num_block_positions_per_image(Size2D(width, height));
 
     // Number of tensor channels = (Number of cells per block) * (Number of bins per cell)
     const size_t num_channels = num_cells_per_block.area() * hog_info.num_bins();
 
-    return init_auto_padding(TensorShape(num_blocks_per_img.width, num_blocks_per_img.height), num_channels, DataType::F32);
+    return init_auto_padding(TensorShape(num_block_positions_per_img.width, num_block_positions_per_img.height), num_channels, DataType::F32);
 }
 
 bool TensorInfo::auto_padding()
@@ -321,7 +322,7 @@
 {
     _data_type = data_type;
     _format    = Format::UNKNOWN;
-    return *this;
+    return set_tensor_shape(tensor_shape()); // Force total size and strides to update
 }
 
 ITensorInfo &TensorInfo::set_num_channels(int num_channels)
@@ -384,6 +385,12 @@
     return *this;
 }
 
+ITensorInfo &TensorInfo::set_data_layout(const DataLayout &data_layout)
+{
+    _data_layout = data_layout;
+    return *this;
+}
+
 ITensorInfo &TensorInfo::reset_padding()
 {
     _padding = PaddingSize();
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index f4b4553..b1c5992 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -126,6 +126,18 @@
     return channels_map[channel];
 }
 
+const std::string &arm_compute::string_from_data_layout(DataLayout dl)
+{
+    static std::map<DataLayout, const std::string> dl_map =
+    {
+        { DataLayout::UNKNOWN, "UNKNOWN" },
+        { DataLayout::NCHW, "NCHW" },
+        { DataLayout::NHWC, "NHWC" },
+    };
+
+    return dl_map[dl];
+}
+
 const std::string &arm_compute::string_from_data_type(DataType dt)
 {
     static std::map<DataType, const std::string> dt_map =
@@ -145,6 +157,7 @@
         { DataType::F32, "F32" },
         { DataType::F64, "F64" },
         { DataType::SIZET, "SIZET" },
+        { DataType::QASYMM8, "QASYMM8" },
     };
 
     return dt_map[dt];
@@ -292,7 +305,8 @@
 
 const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
                                                                            unsigned int kernel_width, unsigned int kernel_height,
-                                                                           const PadStrideInfo &pad_stride_info)
+                                                                           const PadStrideInfo &pad_stride_info,
+                                                                           const Size2D        &dilation)
 {
     const unsigned int pad_left   = pad_stride_info.pad_left();
     const unsigned int pad_top    = pad_stride_info.pad_top();
@@ -305,12 +319,12 @@
     switch(pad_stride_info.round())
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation.x() * (kernel_width - 1) + 1)) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation.y() * (kernel_height - 1) + 1)) / stride_y) + 1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation.x() * (kernel_width - 1) + 1)) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation.y() * (kernel_height - 1) + 1)) / stride_y) + 1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index f5f9f1f..d4fabd4 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -167,9 +167,9 @@
                                                             const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
 {
     // Subtensor should not index in x, y dimensions.
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) && (coords.y() != 0)), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
     // Subtensor shape should match parent tensor in x, y dimensions.
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) && (parent_shape.y() != parent_shape.y())), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != parent_shape.y())), function, file, line);
 
     // Check dimensions
     for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 2fe3a90..e1ffeed 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,292 +23,205 @@
  */
 #include "arm_compute/graph/Graph.h"
 
-#include "arm_compute/graph/CL/CLMap.h"
-#include "arm_compute/graph/CL/CLUnmap.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "support/ToolchainSupport.h"
-
-#include <sys/stat.h>
-
-using namespace arm_compute::graph;
-
-namespace
+namespace arm_compute
 {
-bool file_exists(const std::string &filename)
+namespace graph
 {
-    std::ifstream file(filename);
-    return file.good();
+Graph::Graph(GraphID id, std::string name)
+    : _id(id), _name(std::move(name)), _nodes(), _edges(), _tensors(), _tagged_nodes(), _mtx()
+{
 }
 
-} // namespace
-struct Stage
+bool Graph::remove_node(NodeID nid)
 {
-    ITensorObject                          *_input;
-    ITensorObject                          *_output;
-    std::unique_ptr<arm_compute::IFunction> _function;
-};
-
-struct Graph::Private
-{
-public:
-    /** Finalizes the current node's configuration
-     *
-     * @param _next_hint Device execution hint
-     */
-    void configure(GraphHints _next_hints);
-
-    GraphContext                                _ctx{};
-    std::vector<Stage>                          _pipeline{};
-    std::vector<std::unique_ptr<ITensorObject>> _tensors{};
-    std::vector<std::unique_ptr<INode>>         _nodes{};
-    GraphHints                                  _current_hints{};
-    GraphHints                                  _next_hints{};
-    std::unique_ptr<ITensorObject>              _graph_input{ nullptr };
-    std::unique_ptr<ITensorObject>              _graph_output{ nullptr };
-    std::unique_ptr<INode>                      _current_node{ nullptr };
-    ITensorObject                              *_current_output{ nullptr };
-    bool                                        _info_enabled{ false };
-    CLTuner                                     _tuner{};
-
-private:
-    ITensorObject *_current_input{ nullptr };
-    GraphHints     _previous_hints{};
-};
-
-static const std::string tuner_data_filename = "acl_tuner.csv";
-Graph::~Graph() //NOLINT
-{
-    if(_pimpl->_tuner.tune_new_kernels() && !_pimpl->_tuner.lws_table().empty())
+    if(nid >= _nodes.size())
     {
-        _pimpl->_tuner.save_to_file(tuner_data_filename);
+        return false;
     }
-}
 
-Graph::Graph()
-    : _pimpl{ new Private() }
-{
-    graph_init();
-}
+    std::unique_ptr<INode> &node = _nodes[nid];
 
-void Graph::graph_init(const bool use_cl_tuner)
-{
-    // Check if OpenCL is available and initialize the scheduler
-    if(opencl_is_available())
+    // Remove node connections
+    if(node)
     {
-        if(_pimpl->_tuner.lws_table().empty() && file_exists(tuner_data_filename))
+        for(auto &input_eid : node->_input_edges)
         {
-            _pimpl->_tuner.load_from_file(tuner_data_filename);
+            remove_connection(input_eid);
         }
-        _pimpl->_tuner.set_tune_new_kernels(use_cl_tuner);
-        arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
-    }
-}
-void Graph::run()
-{
-    while(true)
-    {
-        if(_pimpl->_graph_input->has_accessor() && !_pimpl->_graph_input->call_accessor())
+        for(auto &outpud_eid : node->_output_edges)
         {
-            return;
-        }
-
-        for(auto &stage : _pimpl->_pipeline)
-        {
-            stage._function->run();
-        }
-
-        if((_pimpl->_graph_output->has_accessor() && !_pimpl->_graph_output->call_accessor())
-           || (!_pimpl->_graph_output->has_accessor()))
-        {
-            return;
-        }
-    }
-}
-
-//Finalize current node's configuration
-void Graph::Private::configure(GraphHints _next_hints)
-{
-    ARM_COMPUTE_ERROR_ON(_current_node == nullptr);
-    ARM_COMPUTE_ERROR_ON(_graph_input == nullptr);
-
-    // Is it the first node of the graph ?
-    if(_current_input == nullptr)
-    {
-        _graph_input->set_target(_current_hints.target_hint());
-        _current_input  = _graph_input.get();
-        _previous_hints = _current_hints; // For the first node just assume the previous node was of the same type as this one
-    }
-
-    if(_current_node->supports_in_place())
-    {
-        _current_output = _current_input;
-    }
-
-    //Automatic output configuration ?
-    if(_current_output == nullptr)
-    {
-        _tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo()));
-        _current_output = _tensors.back().get();
-    }
-
-    // If either the writer or reader node needs OpenCL then use OpenCL memory:
-    if((_next_hints.target_hint() == TargetHint::OPENCL || _current_hints.target_hint() == TargetHint::OPENCL))
-    {
-        _current_output->set_target(TargetHint::OPENCL);
-    }
-    else
-    {
-        _current_output->set_target(TargetHint::NEON);
-    }
-
-    // Instantiate Node
-    _ctx.hints()                                 = _current_hints;
-    std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input, _current_output);
-
-    // If the operation is done in-place, do not allocate or it will prevent following layers from performing the configuration
-    if(!_current_node->supports_in_place())
-    {
-        // Allocate current input
-        _current_input->allocate();
-    }
-
-    // Map input if needed
-    if(_current_input->target() == TargetHint::OPENCL)
-    {
-        if(_previous_hints.target_hint() == TargetHint::NEON)
-        {
-            ARM_COMPUTE_ERROR_ON(_current_hints.target_hint() == TargetHint::NEON);
-            _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLUnmap>(_current_input) });
-        }
-        if(_current_hints.target_hint() == TargetHint::NEON)
-        {
-            ARM_COMPUTE_ERROR_ON(_previous_hints.target_hint() == TargetHint::NEON);
-            _pipeline.push_back({ _current_input, _current_input, arm_compute::support::cpp14::make_unique<CLMap>(_current_input, true) });
+            remove_connection(outpud_eid);
         }
     }
 
-    _pipeline.push_back({ _current_input, _current_output, std::move(func) });
+    node = nullptr;
 
-    _current_input  = _current_output;
-    _current_output = nullptr;
-    std::swap(_previous_hints, _current_hints);
-    std::swap(_current_hints, _next_hints);
+    return true;
 }
 
-void Graph::add_node(std::unique_ptr<INode> node)
+EdgeID Graph::add_connection(NodeID source, size_t source_idx, NodeID sink, size_t sink_idx)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_input == nullptr, "The graph's input must be set before the first node is added");
-    ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_graph_output != nullptr, "Nothing can be added after the output tensor");
-    //Trigger the creation of the current Node:
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
 
-    GraphHints _next_hints = _pimpl->_next_hints;
-    _next_hints.set_target_hint(node->override_target_hint(_pimpl->_next_hints.target_hint()));
-    ARM_COMPUTE_ERROR_ON(_next_hints.target_hint() == TargetHint::DONT_CARE);
-    if(_pimpl->_current_node)
+    // Check if node index is valid, if node exists and finally if the connection index is valid
+    ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || (source_idx >= _nodes[source]->num_outputs()));
+    ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || (sink_idx >= _nodes[sink]->num_inputs()));
+
+    // Get nodes
+    std::unique_ptr<INode> &source_node = _nodes[source];
+    std::unique_ptr<INode> &sink_node   = _nodes[sink];
+
+    // Check for duplicate connections (Check only sink node)
+    Edge *sink_node_edge = sink_node->input_edge(sink_idx);
+    if((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && (sink_node_edge->producer_idx() == source_idx)
+       && (sink_node_edge->consumer_id() == sink) && (sink_node_edge->consumer_idx() == sink_idx))
     {
-        //Finalize the previous Node:
-        _pimpl->configure(_pimpl->_next_hints);
+        return sink_node_edge->id();
     }
-    else
+
+    // Check if there is already a tensor associated with output if not create one
+    TensorID tid = source_node->output_id(source_idx);
+    if(tid == NullTensorID)
     {
-        // If that's the first node then use the same TargetHint before and after the node.
-        _pimpl->_current_hints = _next_hints;
+        tid = create_tensor();
     }
-    if(_pimpl->_current_node)
+    std::unique_ptr<Tensor> &tensor = _tensors[tid];
+
+    // Create connections
+    EdgeID eid        = _edges.size();
+    auto   connection = arm_compute::support::cpp14::make_unique<Edge>(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get());
+    _edges.push_back(std::move(connection));
+
+    // Add connections to source and sink nodes
+    source_node->_output_edges.insert(eid);
+    sink_node->_input_edges[sink_idx] = eid;
+
+    // Set tensor output node
+    source_node->_outputs[source_idx] = tid;
+
+    // Bind tensor to the edge
+    tensor->bind_edge(eid);
+
+    // Try and propagate shapes in sink node
+    sink_node->forward_descriptors();
+
+    return eid;
+}
+
+bool Graph::remove_connection(EdgeID eid)
+{
+    if(eid >= _edges.size())
     {
-        _pimpl->_nodes.push_back(std::move(_pimpl->_current_node));
+        return false;
     }
-    _pimpl->_current_node = std::move(node);
-}
 
-//Add a tensor with an Accessor (i.e either the input or output of the graph)
-void Graph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
-{
-    // If it's the first Tensor added then it will be the input of the Graph.
-    if(_pimpl->_graph_input == nullptr)
+    std::unique_ptr<Edge> &edge = _edges[eid];
+
+    // Remove node connections
+    if(edge != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
-        ARM_COMPUTE_ERROR_ON(_pimpl->_current_node != nullptr);
-        _pimpl->_graph_input = std::move(tensor);
+        // Get tensor bound to the edge
+        if(edge->tensor() != nullptr)
+        {
+            edge->tensor()->unbind_edge(eid);
+        }
+
+        // Remove edges from source node
+        if(edge->producer() != nullptr)
+        {
+            edge->producer()->_output_edges.erase(eid);
+        }
+
+        // Remove edges from sink node
+        if((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size()))
+        {
+            edge->consumer()->_input_edges[edge->consumer_idx()] = EmptyEdgeID;
+        }
     }
-    else
-    {
-        // Else it will be the output of the Graph
-        ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
-        ARM_COMPUTE_ERROR_ON(_pimpl->_current_node == nullptr);
-        _pimpl->_graph_output   = std::move(tensor);
-        _pimpl->_current_output = _pimpl->_graph_output.get();
 
-        // Finalize the graph by configuring the last Node of the graph:
-        _pimpl->configure(_pimpl->_current_hints); // Ignore _next_hint as this is the last node, and just use the same hint as before this node.
-        _pimpl->_graph_output->allocate();
-    }
+    // Clear edge
+    edge = nullptr;
+
+    return true;
 }
 
-bool Graph::opencl_is_available()
+TensorID Graph::create_tensor(TensorDescriptor desc)
 {
-    return arm_compute::opencl_is_available();
+    TensorID tid    = _tensors.size();
+    auto     tensor = support::cpp14::make_unique<Tensor>(tid, desc);
+    _tensors.push_back(std::move(tensor));
+
+    return tid;
 }
 
-arm_compute::GPUTarget Graph::gpu_target()
+std::string Graph::name() const
 {
-    // Check if OpenCL is available before returning the GPU target
-    if(opencl_is_available())
-    {
-        return arm_compute::CLScheduler::get().target();
-    }
-    else
-    {
-        return GPUTarget::MIDGARD;
-    }
+    return _name;
 }
 
-void Graph::set_temp(TensorInfo &&tmp)
+GraphID Graph::id() const
 {
-    ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);
-    ARM_COMPUTE_ERROR_ON(_pimpl->_graph_output != nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(_pimpl->_current_output != nullptr, "TensorInfo for temporary tensor already set");
-
-    _pimpl->_tensors.push_back(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tmp)));
-    _pimpl->_current_output = _pimpl->_tensors.back().get();
+    return _id;
 }
 
-GraphHints &Graph::hints()
+const std::vector<NodeID> &Graph::inputs()
 {
-    return _pimpl->_next_hints;
+    return _tagged_nodes[NodeType::Input];
 }
 
-Graph &arm_compute::graph::operator<<(Graph &graph, TensorInfo &&info)
+std::vector<std::unique_ptr<INode>> &Graph::nodes()
 {
-    graph.set_temp(std::move(info));
-    return graph;
+    return _nodes;
 }
 
-Graph &arm_compute::graph::operator<<(Graph &graph, Tensor &&tensor)
+const std::vector<std::unique_ptr<INode>> &Graph::nodes() const
 {
-    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
-    return graph;
+    return _nodes;
 }
 
-Graph &arm_compute::graph::operator<<(Graph &graph, SubTensor &&sub_tensor)
+const std::vector<std::unique_ptr<Edge>> &Graph::edges() const
 {
-    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<SubTensor>(std::move(sub_tensor)));
-    return graph;
+    return _edges;
 }
 
-Graph &arm_compute::graph::operator<<(Graph &graph, TargetHint target_hint)
+std::vector<std::unique_ptr<Tensor>> &Graph::tensors()
 {
-    graph.hints().set_target_hint(target_hint);
-    return graph;
+    return _tensors;
 }
 
-Graph &arm_compute::graph::operator<<(Graph &graph, ConvolutionMethodHint conv_method_hint)
+const std::vector<std::unique_ptr<Tensor>> &Graph::tensors() const
 {
-    graph.hints().set_convolution_method_hint(conv_method_hint);
-    return graph;
+    return _tensors;
 }
+
+const INode *Graph::node(NodeID id) const
+{
+    return (id >= _nodes.size()) ? nullptr : _nodes[id].get();
+}
+
+INode *Graph::node(NodeID id)
+{
+    return (id >= _nodes.size()) ? nullptr : _nodes[id].get();
+}
+
+const Edge *Graph::edge(EdgeID id) const
+{
+    return (id >= _edges.size()) ? nullptr : _edges[id].get();
+}
+
+Edge *Graph::edge(EdgeID id)
+{
+    return (id >= _edges.size()) ? nullptr : _edges[id].get();
+}
+
+const Tensor *Graph::tensor(TensorID id) const
+{
+    return (id >= _tensors.size()) ? nullptr : _tensors[id].get();
+}
+
+Tensor *Graph::tensor(TensorID id)
+{
+    return (id >= _tensors.size()) ? nullptr : _tensors[id].get();
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
new file mode 100644
index 0000000..4c5d30a
--- /dev/null
+++ b/src/graph/GraphBuilder.cpp
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/GraphBuilder.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/algorithms/BFS.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#define CHECK_NODEIDX_PAIR(pair, g) \
+    ARM_COMPUTE_ERROR_ON(((pair).node_id >= (g).nodes().size()) || ((g).node((pair).node_id) == nullptr) || ((pair).index >= (g).node((pair).node_id)->num_outputs()));
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace
+{
+Status set_node_params(Graph &g, NodeID nid, NodeParams &params)
+{
+    INode *node = g.node(nid);
+    ARM_COMPUTE_RETURN_ERROR_ON(!node);
+
+    node->set_common_node_parameters(params);
+
+    return Status{};
+}
+
+Status set_accessor_on_node(Graph &g, NodeID nid, bool is_output, size_t idx, ITensorAccessorUPtr accessor)
+{
+    INode *node = g.node(nid);
+    ARM_COMPUTE_RETURN_ERROR_ON(!node);
+
+    Tensor *tensor = is_output ? node->output(idx) : node->input(idx);
+    ARM_COMPUTE_RETURN_ERROR_ON(!tensor);
+
+    tensor->set_accessor(std::move(accessor));
+
+    return Status{};
+}
+
+NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+{
+    params.name = params.name.empty() ? "" : params.name + name;
+    auto nid    = GraphBuilder::add_const_node(g, params, std::move(desc), std::move(accessor));
+    set_node_params(g, nid, params);
+    return nid;
+}
+
+template <typename NT, typename... Args>
+NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&... args)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+
+    NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
+    g.add_connection(input.node_id, input.index, nid, 0);
+    set_node_params(g, nid, params);
+
+    return nid;
+}
+
+NodeID create_grouped_convolution(Graph &g, NodeParams &params, NodeIdxPair input, NodeID weights, NodeID bias,
+                                  PadStrideInfo conv_info, ConvolutionMethod method, FastMathHint fast_math_hint, unsigned int num_groups)
+{
+    bool has_bias = (bias != EmptyNodeID);
+
+    // Split input
+    NodeID input_split = GraphBuilder::add_split_node(g, params, input, num_groups, 2);
+
+    // Split weights
+    NodeID weights_split = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, 3);
+
+    // Split bias
+    NodeID bias_split = EmptyNodeID;
+    if(has_bias)
+    {
+        // Split bias
+        bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0);
+    }
+
+    std::vector<NodeIdxPair> convolution_outputs;
+    for(unsigned int i = 0; i < num_groups; ++i)
+    {
+        NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, method, fast_math_hint);
+        g.add_connection(input_split, i, conv_nid, 0);
+        g.add_connection(weights_split, i, conv_nid, 1);
+        if(has_bias)
+        {
+            g.add_connection(bias_split, i, conv_nid, 2);
+        }
+        set_node_params(g, conv_nid, params);
+        convolution_outputs.push_back({ conv_nid, 0 });
+    }
+
+    // Depth concatenate output
+    return GraphBuilder::add_depth_concatenate_node(g, params, convolution_outputs);
+}
+} // namespace
+
+NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+{
+    auto nid = g.add_node<ConstNode>(desc);
+    set_node_params(g, nid, params);
+    set_accessor_on_node(g, nid, true, 0, std::move(accessor));
+    return nid;
+}
+
+NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, TensorDescriptor desc, ITensorAccessorUPtr accessor)
+{
+    auto nid = g.add_node<InputNode>(desc);
+    set_node_params(g, nid, params);
+    set_accessor_on_node(g, nid, true, 0, std::move(accessor));
+    return nid;
+}
+
+NodeID GraphBuilder::add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+
+    NodeID nid = g.add_node<OutputNode>();
+    g.add_connection(input.node_id, input.index, nid, 0);
+    set_node_params(g, nid, params);
+    set_accessor_on_node(g, nid, false, 0, std::move(accessor));
+
+    return nid;
+}
+
+NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info)
+{
+    return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info);
+}
+
+NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
+                                                  ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor,
+                                                  ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+
+    bool has_beta  = (beta_accessor != nullptr);
+    bool has_gamma = (gamma_accessor != nullptr);
+
+    // Get input tensor descriptor
+    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+    // Calculate Common Descriptor
+    TensorDescriptor common_desc = input_tensor_desc;
+    common_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+
+    // Create mean and nodes
+    auto mean_nid = add_const_node_with_name(g, params, "Mean", common_desc, std::move(mean_accessor));
+    auto var_nid  = add_const_node_with_name(g, params, "Variance", common_desc, std::move(var_accessor));
+
+    // Create beta node
+    NodeID beta_nid = EmptyNodeID;
+    if(has_beta)
+    {
+        beta_nid = add_const_node_with_name(g, params, "Beta", common_desc, std::move(beta_accessor));
+    }
+
+    // Create gamma node
+    NodeID gamma_nid = EmptyNodeID;
+    if(has_gamma)
+    {
+        gamma_nid = add_const_node_with_name(g, params, "Gamma", common_desc, std::move(gamma_accessor));
+    }
+
+    // Create batch normalization node and add connections
+    NodeID batch_norm_nid = g.add_node<BatchNormalizationLayerNode>(epsilon);
+    g.add_connection(input.node_id, input.index, batch_norm_nid, 0);
+    g.add_connection(mean_nid, 0, batch_norm_nid, 1);
+    g.add_connection(var_nid, 0, batch_norm_nid, 2);
+    if(has_beta)
+    {
+        g.add_connection(beta_nid, 0, batch_norm_nid, 3);
+    }
+    if(has_gamma)
+    {
+        g.add_connection(gamma_nid, 0, batch_norm_nid, 4);
+    }
+    set_node_params(g, batch_norm_nid, params);
+
+    return batch_norm_nid;
+}
+
/** Adds a convolution layer node to the graph, together with its constant
 *  "Weights" (and optional "Bias") nodes, and wires them up.
 *
 * The weights tensor is shaped [kernel_w, kernel_h, C_in / num_groups, depth]
 * following the input's data layout; when a bias accessor is supplied the bias
 * is a 1D tensor of @p depth elements. For num_groups == 1 a single
 * ConvolutionLayerNode is emitted; otherwise construction is delegated to
 * create_grouped_convolution().
 *
 * @return ID of the node that produces the convolution result.
 */
NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
                                          Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info,
                                          unsigned int num_groups, ConvolutionMethod method, FastMathHint fast_math_hint,
                                          ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
                                          const QuantizationInfo weights_quant_info,
                                          const QuantizationInfo out_quant_info)
{
    CHECK_NODEIDX_PAIR(input, g);
    ARM_COMPUTE_ERROR_ON(depth == 0);
    ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));

    bool has_bias = (bias_accessor != nullptr);

    // Get input tensor descriptor
    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);

    // Create weights node: start from the input descriptor so data type/layout
    // are inherited, then overwrite the four dimensions explicitly
    TensorDescriptor w_desc = input_tensor_desc;
    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
                     get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups);
    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::BATCHES), depth);
    if(!weights_quant_info.empty())
    {
        w_desc.quant_info = weights_quant_info;
    }

    NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));

    // Create bias nodes (1D, one element per output feature map)
    NodeID b_nid = EmptyNodeID;
    if(has_bias)
    {
        TensorDescriptor b_desc = input_tensor_desc;
        b_desc.shape            = TensorShape(depth);
        b_nid                   = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
    }

    if(num_groups == 1)
    {
        // Create convolution node and connect
        NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, method, fast_math_hint, out_quant_info);
        g.add_connection(input.node_id, input.index, conv_nid, 0);
        g.add_connection(w_nid, 0, conv_nid, 1);
        if(has_bias)
        {
            g.add_connection(b_nid, 0, conv_nid, 2);
        }
        set_node_params(g, conv_nid, params);

        return conv_nid;
    }
    else
    {
        // Grouped convolution is lowered into per-group convolutions by a helper
        return create_grouped_convolution(g, params, input, w_nid, b_nid, conv_info, method, fast_math_hint, num_groups);
    }
}
+
+NodeID GraphBuilder::add_depth_concatenate_node(Graph &g, NodeParams params, std::vector<NodeIdxPair> inputs)
+{
+    ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
+
+    NodeID nid = g.add_node<DepthConcatenateLayerNode>(inputs.size());
+
+    unsigned int i = 0;
+    for(const auto &input : inputs)
+    {
+        CHECK_NODEIDX_PAIR(input, g);
+        g.add_connection(input.node_id, input.index, nid, i++);
+    }
+    set_node_params(g, nid, params);
+
+    return nid;
+}
+
+NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend, PadStrideInfo conv_info,
+                                                    DepthwiseConvolutionMethod method,
+                                                    ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo quant_info)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+    ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
+
+    bool has_bias = (bias_accessor != nullptr);
+
+    // Get input tensor descriptor
+    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+    // Create weights node
+    TensorDescriptor w_desc = input_tensor_desc;
+    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), kernel_spatial_extend.width);
+    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
+    w_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL),
+                     get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+    if(!quant_info.empty())
+    {
+        w_desc.quant_info = quant_info;
+    }
+
+    NodeID w_nid = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
+
+    // Create bias nodes
+    NodeID b_nid = EmptyNodeID;
+    if(has_bias)
+    {
+        TensorDescriptor b_desc = input_tensor_desc;
+        b_desc.shape            = TensorShape(b_desc.shape.z());
+        b_nid                   = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
+    }
+
+    // Create convolution node and connect
+    NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, method);
+    g.add_connection(input.node_id, input.index, conv_nid, 0);
+    g.add_connection(w_nid, 0, conv_nid, 1);
+    if(has_bias)
+    {
+        g.add_connection(b_nid, 0, conv_nid, 2);
+    }
+    set_node_params(g, conv_nid, params);
+
+    return conv_nid;
+}
+
+NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
+{
+    CHECK_NODEIDX_PAIR(input0, g);
+    CHECK_NODEIDX_PAIR(input1, g);
+
+    NodeID nid = g.add_node<EltwiseLayerNode>(operation);
+
+    g.add_connection(input0.node_id, input0.index, nid, 0);
+    g.add_connection(input1.node_id, input1.index, nid, 1);
+
+    set_node_params(g, nid, params);
+
+    return nid;
+}
+
/** Adds a flatten layer node fed by @p input and returns its node ID. */
NodeID GraphBuilder::add_flatten_node(Graph &g, NodeParams params, NodeIdxPair input)
{
    return create_simple_single_input_output_node<FlattenLayerNode>(g, params, input);
}
+
/** Adds a fully connected layer node plus its constant "Weights" (and optional
 *  "Bias") nodes, and wires them up.
 *
 * The weights descriptor is computed by the node class itself from the input
 * descriptor and @p num_outputs; the bias (if provided) is a 1D tensor of
 * @p num_outputs elements.
 *
 * @return ID of the fully connected node.
 */
NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
                                               ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor)
{
    CHECK_NODEIDX_PAIR(input, g);
    ARM_COMPUTE_ERROR_ON(num_outputs == 0);

    bool has_bias = (bias_accessor != nullptr);

    // Get input tensor descriptor
    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);

    // Create weights node
    TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs);
    NodeID           w_nid  = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));

    // Create bias nodes
    NodeID b_nid = EmptyNodeID;
    if(has_bias)
    {
        TensorDescriptor b_desc = input_tensor_desc;
        b_desc.shape            = TensorShape(num_outputs);
        b_nid                   = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
    }

    // Create fully connected node and connect
    NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs);
    g.add_connection(input.node_id, input.index, fc_nid, 0);
    g.add_connection(w_nid, 0, fc_nid, 1);
    if(has_bias)
    {
        g.add_connection(b_nid, 0, fc_nid, 2);
    }

    set_node_params(g, fc_nid, params);

    return fc_nid;
}
+
/** Adds a normalization layer node configured with @p norm_info. */
NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
{
    return create_simple_single_input_output_node<NormalizationLayerNode>(g, params, input, norm_info);
}

/** Adds a pooling layer node configured with @p pool_info. */
NodeID GraphBuilder::add_pooling_node(Graph &g, NodeParams params, NodeIdxPair input, PoolingLayerInfo pool_info)
{
    return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
}

/** Adds a reshape layer node producing a tensor of shape @p shape. */
NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
{
    return create_simple_single_input_output_node<ReshapeLayerNode>(g, params, input, shape);
}
+
/** Adds a per-channel scale layer, lowered as (input * Mul) + Add.
 *
 * The "Mul" and "Add" constants are shaped 1x1xC (broadcast over the spatial
 * dimensions) where C is the input's channel count in its data layout.
 *
 * @return ID of the final element-wise ADD node.
 */
NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
{
    CHECK_NODEIDX_PAIR(input, g);

    // Get input tensor descriptor
    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);

    // Create mul node: collapse width/height to 1, keep the channel count
    TensorDescriptor mul_desc = input_tensor_desc;
    const size_t     C        = input_tensor_desc.shape[get_dimension_idx(mul_desc, DataLayoutDimension::CHANNEL)];
    mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::WIDTH), 1);
    mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::HEIGHT), 1);
    mul_desc.shape.set(get_dimension_idx(input_tensor_desc, DataLayoutDimension::CHANNEL), C);
    NodeID      mul_const_nid   = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor));
    NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 };

    // Create add node (same 1x1xC shape as the multiplier)
    TensorDescriptor add_desc        = mul_desc;
    NodeID           add_const_nid   = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor));
    NodeIdxPair      add_const_nidxp = { add_const_nid, 0 };

    // Create node and connect: out = (input * Mul) + Add
    NodeID      mul_node      = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::MUL);
    NodeIdxPair mulnode_nidxp = { mul_node, 0 };
    NodeID      add_node      = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::ADD);

    return add_node;
}
+
/** Adds a softmax layer node with scaling exponent @p beta. */
NodeID GraphBuilder::add_softmax_node(Graph &g, NodeParams params, NodeIdxPair input, float beta)
{
    return create_simple_single_input_output_node<SoftmaxLayerNode>(g, params, input, beta);
}

/** Adds a split layer node that partitions the input into @p num_splits parts along @p axis. */
NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
{
    return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index bfc6fcd..3f31114 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,45 +22,64 @@
  * SOFTWARE.
  */
 #include "arm_compute/graph/GraphContext.h"
+#include <arm_compute/graph.h>
 
-using namespace arm_compute::graph;
-
-GraphHints::GraphHints(TargetHint target_hint, ConvolutionMethodHint conv_method_hint)
-    : _target_hint(target_hint), _convolution_method_hint(conv_method_hint)
+namespace arm_compute
 {
-}
-
-void GraphHints::set_target_hint(TargetHint target_hint)
+namespace graph
 {
-    _target_hint = target_hint;
-}
-
-void GraphHints::set_convolution_method_hint(ConvolutionMethodHint convolution_method)
-{
-    _convolution_method_hint = convolution_method;
-}
-
-TargetHint GraphHints::target_hint() const
-{
-    return _target_hint;
-}
-
-ConvolutionMethodHint GraphHints::convolution_method_hint() const
-{
-    return _convolution_method_hint;
-}
-
 GraphContext::GraphContext()
-    : _hints()
+    : _config(), _memory_managers()
 {
 }
 
-GraphHints &GraphContext::hints()
+const GraphConfig &GraphContext::config() const
 {
-    return _hints;
+    return _config;
 }
 
-const GraphHints &GraphContext::hints() const
+void GraphContext::set_config(const GraphConfig &config)
 {
-    return _hints;
-}
\ No newline at end of file
+    _config = config;
+}
+
+bool GraphContext::insert_memory_management_ctx(MemoryManagerContext &&memory_ctx)
+{
+    Target target = memory_ctx.target;
+    if(target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers))
+    {
+        return false;
+    }
+
+    _memory_managers[target] = std::move(memory_ctx);
+    return true;
+}
+
+MemoryManagerContext *GraphContext::memory_management_ctx(Target target)
+{
+    return (_memory_managers.find(target) != std::end(_memory_managers)) ? &_memory_managers[target] : nullptr;
+}
+
/** Returns a mutable reference to the per-target memory-manager context map. */
std::map<Target, MemoryManagerContext> &GraphContext::memory_managers()
{
    return _memory_managers;
}
+
+void GraphContext::finalize()
+{
+    for(auto &mm_obj : _memory_managers)
+    {
+        // Finalize intra layer memory manager
+        if(mm_obj.second.intra_mm != nullptr)
+        {
+            mm_obj.second.intra_mm->finalize();
+        }
+        // Finalize cross layer memory manager
+        if(mm_obj.second.cross_mm != nullptr)
+        {
+            mm_obj.second.cross_mm->finalize();
+        }
+    }
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
new file mode 100644
index 0000000..ad45845
--- /dev/null
+++ b/src/graph/GraphManager.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/GraphManager.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/PassManager.h"
+#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+#include "arm_compute/graph/detail/ExecutionHelpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
/** Default constructor: starts with no registered workloads and
 *  default-initializes all available execution backends. */
GraphManager::GraphManager()
    : _workloads()
{
    detail::default_initialize_backends();
}
+
/** Compiles @p graph into an executable workload and registers it under the graph's ID.
 *
 * Pipeline (order matters): set up the context, force a supported target onto
 * all graph constructs, configure tensors, run the mutating passes, validate,
 * configure nodes into tasks, allocate and fill const tensors, set up tensor
 * memory (transition manager or plain allocation), finalize the context, then
 * register the workload. For non-CL targets a warm-up run is performed so
 * unused const tensors can be released. Aborts if the graph was already
 * finalized.
 *
 * @param graph  Graph to finalize
 * @param ctx    Graph context (memory managers, config)
 * @param pm     Pass manager whose mutating passes are applied
 * @param target Requested target; falls back to the default if unsupported
 */
void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &pm, Target target)
{
    // Setup graph context if not done manually
    setup_default_graph_context(ctx);

    // Check if graph has been registered
    if(_workloads.find(graph.id()) != std::end(_workloads))
    {
        ARM_COMPUTE_ERROR("Graph is already registered!");
    }

    // Force target to all graph construct
    Target forced_target = is_target_supported(target) ? target : get_default_target();
    force_target_to_graph(graph, forced_target);

    // Configure all tensors
    detail::configure_all_tensors(graph);

    // Apply all mutating passes
    pm.run_all(graph);

    // Validate all nodes
    detail::validate_all_nodes(graph);

    // Configure all nodes
    auto workload = detail::configure_all_nodes(graph, ctx);
    ARM_COMPUTE_ERROR_ON_MSG(workload.tasks.empty(), "Could not configure all nodes!");

    // Allocate const tensors and call accessors
    detail::allocate_const_tensors(graph);
    detail::call_all_const_node_accessors(graph);

    if(forced_target == Target::CL)
    {
        // Prepare graph
        detail::prepare_all_tasks(workload);
    }

    // Setup tensor memory (Allocate all tensors or setup transition manager)
    if(ctx.config().use_transition_memory_manager)
    {
        detail::configure_transition_manager(graph, ctx, workload);
    }
    else
    {
        detail::allocate_all_tensors(graph);
    }

    // Finalize Graph context
    ctx.finalize();

    // Register graph
    _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);

    if(forced_target != Target::CL)
    {
        // Make first run (warm-up) so constants get consumed
        execute_graph(graph);

        // Release all unused const tensors
        detail::release_unused_tensors(graph);
    }
}
+
+void GraphManager::execute_graph(Graph &graph)
+{
+    // Check if graph is finalized
+    auto it = _workloads.find(graph.id());
+    ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
+
+    // Call input accessors
+    detail::call_all_input_node_accessors(it->second);
+
+    // Run graph
+    detail::call_all_tasks(it->second);
+
+    // Call output accessors
+    detail::call_all_output_node_accessors(it->second);
+}
+
+void GraphManager::invalidate_graph(Graph &graph)
+{
+    auto it = _workloads.find(graph.id());
+    ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
+
+    _workloads.erase(it);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index c753f66..cd9a46a 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,33 +23,176 @@
  */
 #include "arm_compute/graph/INode.h"
 
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/Edge.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Tensor.h"
 
-using namespace arm_compute::graph;
-
-TargetHint INode::override_target_hint(TargetHint target_hint) const
+namespace arm_compute
 {
-    if(target_hint == TargetHint::OPENCL && !opencl_is_available())
+namespace graph
+{
+// *INDENT-OFF*
+// clang-format off
/** Default constructor: the node starts detached — no owning graph, empty ID,
 *  no outputs/edges, and an unspecified requested/assigned target. */
INode::INode()
    : _graph(nullptr), _id(EmptyNodeID), _common_params({ "", Target::UNSPECIFIED}),
      _outputs(), _input_edges(), _output_edges(), _assigned_target(Target::UNSPECIFIED)
{
}
+// clang-format on
+// *INDENT-ON*
+
/** Base implementation: nodes are valid by default; subclasses override to add real checks. */
Status INode::validate() const
{
    return Status{};
}
+
/** Attaches this node to graph @p g; the pointer must not be null. */
void INode::set_graph(Graph *g)
{
    ARM_COMPUTE_ERROR_ON(g == nullptr);
    _graph = g;
}

/** Sets the node's ID within its graph. */
void INode::set_id(NodeID id)
{
    _id = id;
}

/** Sets the shared node parameters (name and requested target). */
void INode::set_common_node_parameters(NodeParams common_params)
{
    _common_params = std::move(common_params);
}

/** Records the target requested by the user; may differ from the finally assigned one. */
void INode::set_requested_target(Target target)
{
    _common_params.target = target;
}

/** Records the target actually assigned to this node. */
void INode::set_assigned_target(Target target)
{
    _assigned_target = target;
}
+
+void INode::set_output_tensor(TensorID tid, size_t idx)
+{
+    if(tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr))
     {
-        target_hint = TargetHint::DONT_CARE;
+        ARM_COMPUTE_ERROR_ON(_graph == nullptr);
+        Tensor *updated_tensor = _graph->tensor(tid);
+        _outputs[idx]          = tid;
+
+        // Set tensor to all output edges of the node
+        for(auto &output_edge_id : _output_edges)
+        {
+            auto output_edge = _graph->edge(output_edge_id);
+            if(output_edge != nullptr)
+            {
+                // Unbind edge from current tensor
+                auto current_output_tensor = output_edge->tensor();
+                current_output_tensor->unbind_edge(output_edge->id());
+
+                // Update tensor to edge and rebind tensor
+                output_edge->update_bound_tensor(updated_tensor);
+                updated_tensor->bind_edge(output_edge->id());
+            }
+        }
     }
-    GraphHints hints{ target_hint };
-    target_hint = node_override_hints(hints).target_hint();
-    ARM_COMPUTE_ERROR_ON(target_hint == TargetHint::OPENCL && !opencl_is_available());
-    return target_hint;
 }
-bool INode::supports_in_place() const
+
+NodeID INode::id() const
 {
-    return _supports_in_place;
+    return _id;
 }
-void INode::set_supports_in_place(bool value)
+
+std::string INode::name() const
 {
-    _supports_in_place = value;
+    return _common_params.name;
 }
-GraphHints INode::node_override_hints(GraphHints hints) const
+
+const Graph *INode::graph() const
 {
-    TargetHint target_hint = hints.target_hint();
-    hints.set_target_hint((target_hint == TargetHint::DONT_CARE) ? TargetHint::NEON : target_hint);
-    return hints;
+    return _graph;
 }
+
/** Returns the owning graph (mutable); nullptr until set_graph() is called. */
Graph *INode::graph()
{
    return _graph;
}
+
/** Returns the IDs of this node's output tensors (one per output slot). */
const std::vector<TensorID> &INode::outputs() const
{
    return _outputs;
}

/** Returns the IDs of this node's incoming edges (one per input port). */
const std::vector<EdgeID> &INode::input_edges() const
{
    return _input_edges;
}

/** Returns the set of this node's outgoing edge IDs. */
const std::set<EdgeID> &INode::output_edges() const
{
    return _output_edges;
}
+
+TensorID INode::input_id(size_t idx) const
+{
+    ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
+    Edge *e = _graph->edge(_input_edges[idx]);
+    return (e != nullptr) ? e->tensor_id() : NullTensorID;
+}
+
/** Returns the tensor ID bound to output slot @p idx. */
TensorID INode::output_id(size_t idx) const
{
    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
    return _outputs[idx];
}
+
+Tensor *INode::input(size_t idx) const
+{
+    ARM_COMPUTE_ERROR_ON(_graph == nullptr);
+    ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
+    Edge *e = _graph->edge(_input_edges[idx]);
+    return (e != nullptr) ? e->tensor() : nullptr;
+}
+
/** Returns the tensor bound to output slot @p idx (nullptr if unbound in the graph). */
Tensor *INode::output(size_t idx) const
{
    ARM_COMPUTE_ERROR_ON(_graph == nullptr);
    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
    return _graph->tensor(_outputs[idx]);
}
+
/** Returns the ID of the edge attached to input port @p idx. */
EdgeID INode::input_edge_id(size_t idx) const
{
    ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
    return _input_edges[idx];
}

/** Returns the edge attached to input port @p idx (nullptr if unset). */
Edge *INode::input_edge(size_t idx) const
{
    ARM_COMPUTE_ERROR_ON(_graph == nullptr);
    ARM_COMPUTE_ERROR_ON(idx >= _input_edges.size());
    return _graph->edge(_input_edges[idx]);
}
+
/** Returns the number of input ports of this node. */
size_t INode::num_inputs() const
{
    return _input_edges.size();
}

/** Returns the number of output slots of this node. */
size_t INode::num_outputs() const
{
    return _outputs.size();
}

/** Returns the target requested via node parameters. */
Target INode::requested_target() const
{
    return _common_params.target;
}

/** Returns the target the backend assigned to this node. */
Target INode::assigned_target() const
{
    return _assigned_target;
}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/NodeContext.cpp b/src/graph/NodeContext.cpp
deleted file mode 100644
index 2aa5aa1..0000000
--- a/src/graph/NodeContext.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/NodeContext.h"
-
-using namespace arm_compute::graph;
-
-void NodeContext::set_target(TargetHint target)
-{
-    _target = target;
-}
-
-void NodeContext::add_input(arm_compute::ITensor *input)
-{
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-    _inputs.emplace_back(input);
-}
-
-void NodeContext::add_output(arm_compute::ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-    _outputs.emplace_back(output);
-}
-
-OperationType NodeContext::operation() const
-{
-    return _operation;
-}
-
-TargetHint NodeContext::target() const
-{
-    return _target;
-}
-
-arm_compute::ITensor *NodeContext::input(size_t idx) const
-{
-    ARM_COMPUTE_ERROR_ON(idx >= _inputs.size());
-    return _inputs[idx];
-}
-
-arm_compute::ITensor *NodeContext::output(size_t idx) const
-{
-    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
-    return _outputs[idx];
-}
-
-size_t NodeContext::num_inputs() const
-{
-    return _inputs.size();
-}
-
-size_t NodeContext::num_outputs() const
-{
-    return _outputs.size();
-}
\ No newline at end of file
diff --git a/src/graph/OperationRegistry.cpp b/src/graph/OperationRegistry.cpp
deleted file mode 100644
index 651653f..0000000
--- a/src/graph/OperationRegistry.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-OperationRegistry::OperationRegistry()
-    : _registered_ops()
-{
-}
-
-OperationRegistry &OperationRegistry::get()
-{
-    static OperationRegistry instance;
-    return instance;
-}
-
-IOperation *OperationRegistry::find_operation(OperationType operation, TargetHint target)
-{
-    ARM_COMPUTE_ERROR_ON(!contains(operation, target));
-    auto it = std::find_if(_registered_ops[operation].begin(), _registered_ops[operation].end(), [&](const std::unique_ptr<IOperation> &op)
-    {
-        return (op->target() == target);
-    });
-    ARM_COMPUTE_ERROR_ON(it == _registered_ops[operation].end());
-    return (*it).get();
-}
-
-bool OperationRegistry::contains(OperationType operation, TargetHint target) const
-{
-    auto it = _registered_ops.find(operation);
-    if(it != _registered_ops.end())
-    {
-        return std::any_of(it->second.begin(), it->second.end(), [&](const std::unique_ptr<IOperation> &op)
-        {
-            return (op->target() == target);
-        });
-    }
-    return false;
-}
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
new file mode 100644
index 0000000..8ed68bd
--- /dev/null
+++ b/src/graph/PassManager.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/PassManager.h"
+
+#include "arm_compute/graph/Logger.h"
+
+namespace arm_compute
+{
+namespace graph
+{
/** Default constructor: starts with an empty pass list. */
PassManager::PassManager()
    : _passes()
{
}

/** Returns the ordered list of registered mutating passes. */
const std::vector<std::unique_ptr<IGraphMutator>> &PassManager::passes() const
{
    return _passes;
}

/** Returns the pass at @p index, or nullptr when the index is out of range. */
IGraphMutator *PassManager::pass(size_t index)
{
    return (index >= _passes.size()) ? nullptr : _passes.at(index).get();
}
+
+void PassManager::append(std::unique_ptr<IGraphMutator> pass)
+{
+    if(pass)
+    {
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl);
+        _passes.push_back(std::move(pass));
+    }
+}
+
/** Removes all registered passes. */
void PassManager::clear()
{
    _passes.clear();
}

/** Runs every registered pass over @p g in registration order, skipping null entries. */
void PassManager::run_all(Graph &g)
{
    for(auto &pass : _passes)
    {
        if(pass)
        {
            ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
            pass->mutate(g);
        }
    }
}
+
+void PassManager::run(Graph &g, size_t index)
+{
+    if(index >= _passes.size())
+    {
+        return;
+    }
+
+    auto &pass = _passes.at(index);
+
+    if(pass != nullptr)
+    {
+        pass->mutate(g);
+    }
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/SubGraph.cpp b/src/graph/SubGraph.cpp
deleted file mode 100644
index 4065e1d..0000000
--- a/src/graph/SubGraph.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/SubGraph.h"
-
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Tensor.h"
-
-using namespace arm_compute::graph;
-
-SubGraph::SubGraph()
-    : _nodes(), _input(nullptr), _output(nullptr)
-{
-}
-
-void SubGraph::add_node(std::unique_ptr<INode> node)
-{
-    _nodes.push_back(std::move(node));
-}
-
-void SubGraph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
-{
-    // If it's the first Tensor added then it will be the input of the Graph.
-    if(_input == nullptr)
-    {
-        _input = std::move(tensor);
-    }
-    else
-    {
-        _output = std::move(tensor);
-    }
-}
-
-std::unique_ptr<Graph> SubGraph::construct(const GraphContext &ctx, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output)
-{
-    auto graph = arm_compute::support::cpp14::make_unique<Graph>();
-
-    // Set hint
-    graph->hints() = ctx.hints();
-
-    // Configure input
-    if(_input == nullptr)
-    {
-        _input = std::move(input);
-    }
-    graph->add_tensor_object(std::move(_input));
-
-    // Make sure first and last nodes of the subgraph always do operations out-of-place
-    _nodes.front()->set_supports_in_place(false);
-    _nodes.back()->set_supports_in_place(false);
-
-    // Construct nodes
-    for(auto &node : _nodes)
-    {
-        graph->add_node(std::move(node));
-    }
-
-    // Configure output
-    if(_output == nullptr)
-    {
-        _output = std::move(output);
-    }
-    graph->add_tensor_object(std::move(_output));
-
-    return graph;
-}
-
-bool SubGraph::has_input() const
-{
-    return _input != nullptr;
-}
-
-bool SubGraph::has_output() const
-{
-    return _output != nullptr;
-}
-
-SubGraph &arm_compute::graph::operator<<(SubGraph &graph, Tensor &&tensor)
-{
-    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
-    return graph;
-}
-
-SubGraph &arm_compute::graph::operator<<(SubGraph &graph, SubTensor &&sub_tensor)
-{
-    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<SubTensor>(std::move(sub_tensor)));
-    return graph;
-}
diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp
deleted file mode 100644
index 2e640dd..0000000
--- a/src/graph/SubTensor.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/SubTensor.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLSubTensor.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/SubTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "utils/TypePrinter.h"
-
-using namespace arm_compute::graph;
-
-namespace
-{
-template <typename SubTensorType, typename ParentTensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords, bool extend_parent)
-{
-    auto ptensor   = dynamic_cast<ParentTensorType *>(parent);
-    auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords, extend_parent);
-    return std::move(subtensor);
-}
-} // namespace
-
-SubTensor::SubTensor()
-    : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr), _extend_parent(false)
-{
-}
-
-SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
-    : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr), _extend_parent(extend_parent)
-{
-    ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr);
-    _parent = parent.tensor();
-    _target = parent.target();
-
-    instantiate_subtensor();
-}
-
-SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target, bool extend_parent)
-    : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr), _extend_parent(extend_parent)
-{
-    ARM_COMPUTE_ERROR_ON(parent == nullptr);
-    instantiate_subtensor();
-}
-
-bool SubTensor::call_accessor()
-{
-    return true;
-}
-
-bool SubTensor::has_accessor() const
-{
-    return false;
-}
-
-arm_compute::ITensor *SubTensor::set_target(TargetHint target)
-{
-    ARM_COMPUTE_ERROR_ON(target != _target);
-    return (target == _target) ? _subtensor.get() : nullptr;
-}
-
-arm_compute::ITensor *SubTensor::tensor()
-{
-    return _subtensor.get();
-}
-
-const arm_compute::ITensor *SubTensor::tensor() const
-{
-    return _subtensor.get();
-}
-
-TargetHint SubTensor::target() const
-{
-    return _target;
-}
-
-void SubTensor::allocate()
-{
-    // NOP for sub-tensors
-}
-
-void SubTensor::instantiate_subtensor()
-{
-    switch(_target)
-    {
-        case TargetHint::OPENCL:
-            _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords, _extend_parent);
-            break;
-        case TargetHint::NEON:
-            _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords, _extend_parent);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Invalid TargetHint");
-    }
-}
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 4db79e9..287e783 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,138 +23,89 @@
  */
 #include "arm_compute/graph/Tensor.h"
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "utils/TypePrinter.h"
-
-using namespace arm_compute::graph;
-
-namespace
+namespace arm_compute
 {
-template <typename TensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_tensor(TensorInfo &info)
+namespace graph
 {
-    auto tensor = arm_compute::support::cpp14::make_unique<TensorType>();
-    tensor->allocator()->init(info);
-    return std::move(tensor);
-}
-
-template <typename TensorType>
-void tensor_allocate(arm_compute::ITensor &tensor)
-{
-    auto itensor = dynamic_cast<TensorType *>(&tensor);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(itensor);
-    itensor->allocator()->allocate();
-}
-} // namespace
-
-Tensor::Tensor(TensorInfo &&info)
-    : _target(TargetHint::DONT_CARE), _info(info), _accessor(nullptr), _tensor(nullptr)
+Tensor::Tensor(TensorID id, TensorDescriptor desc)
+    : _id(id), _desc(std::move(desc)), _handle(nullptr), _accessor(nullptr), _bound_edges()
 {
 }
 
-Tensor::Tensor(Tensor &&src) noexcept
-    : _target(src._target),
-      _info(std::move(src._info)),
-      _accessor(std::move(src._accessor)),
-      _tensor(std::move(src._tensor))
+TensorID Tensor::id() const
 {
+    return _id;
 }
 
-void Tensor::set_info(TensorInfo &&info)
+TensorDescriptor &Tensor::desc()
 {
-    _info = info;
+    return _desc;
+}
+
+const TensorDescriptor &Tensor::desc() const
+{
+    return _desc;
+}
+
+void Tensor::set_handle(std::unique_ptr<ITensorHandle> backend_tensor)
+{
+    _handle = std::move(backend_tensor);
+}
+
+ITensorHandle *Tensor::handle()
+{
+    return _handle.get();
+}
+
+void Tensor::set_accessor(std::unique_ptr<ITensorAccessor> accessor)
+{
+    _accessor = std::move(accessor);
+}
+
+ITensorAccessor *Tensor::accessor()
+{
+    return _accessor.get();
 }
 
 bool Tensor::call_accessor()
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_accessor.get());
-    auto cl_tensor = dynamic_cast<arm_compute::CLTensor *>(_tensor.get());
-    if(cl_tensor != nullptr && cl_tensor->buffer() == nullptr)
+    // Early exit guard
+    if(!_accessor || !_handle)
     {
-        cl_tensor->map();
+        return false;
     }
-    bool retval = _accessor->access_tensor(*_tensor);
-    if(cl_tensor != nullptr)
+
+    // Map tensor
+    _handle->map(true);
+
+    // Return in case of null backend buffer
+    if(_handle->tensor().buffer() == nullptr)
     {
-        cl_tensor->unmap();
+        return false;
     }
-    return retval;
+
+    // Call accessor
+    _accessor->access_tensor(_handle->tensor());
+
+    // Unmap tensor
+    _handle->unmap();
+
+    return true;
 }
 
-bool Tensor::has_accessor() const
+void Tensor::bind_edge(EdgeID eid)
 {
-    return (_accessor != nullptr);
+    _bound_edges.insert(eid);
 }
 
-arm_compute::ITensor *Tensor::tensor()
+void Tensor::unbind_edge(EdgeID eid)
 {
-    return _tensor.get();
+    _bound_edges.erase(eid);
 }
 
-const arm_compute::ITensor *Tensor::tensor() const
+const std::set<EdgeID> Tensor::bound_edges() const
 {
-    return _tensor.get();
+    return _bound_edges;
 }
-
-const TensorInfo &Tensor::info() const
-{
-    return _info;
-}
-
-arm_compute::ITensor *Tensor::set_target(TargetHint target)
-{
-    if(_tensor != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON(target != _target);
-    }
-    else
-    {
-        switch(target)
-        {
-            case TargetHint::OPENCL:
-                _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
-                break;
-            case TargetHint::NEON:
-                _tensor = initialise_tensor<arm_compute::Tensor>(_info);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Invalid TargetHint");
-        }
-        _target = target;
-    }
-    return _tensor.get();
-}
-
-void Tensor::allocate()
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor.get());
-    switch(_target)
-    {
-        case TargetHint::OPENCL:
-            tensor_allocate<arm_compute::CLTensor>(*_tensor);
-            break;
-        case TargetHint::NEON:
-            tensor_allocate<arm_compute::Tensor>(*_tensor);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Invalid TargetHint");
-    }
-}
-
-void Tensor::allocate_and_fill_if_needed()
-{
-    allocate();
-    if(_accessor != nullptr)
-    {
-        call_accessor();
-    }
-}
-
-TargetHint Tensor::target() const
-{
-    return _target;
-}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
new file mode 100644
index 0000000..030fa2d
--- /dev/null
+++ b/src/graph/Utils.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/Utils.h"
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/mutators/GraphMutators.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+bool is_target_supported(Target target)
+{
+    return backends::BackendRegistry::get().contains(target) && backends::BackendRegistry::get().find_backend(target)->is_backend_supported();
+}
+
+Target get_default_target()
+{
+    if(is_target_supported(Target::NEON))
+    {
+        return Target::NEON;
+    }
+    if(is_target_supported(Target::CL))
+    {
+        return Target::CL;
+    }
+    if(is_target_supported(Target::GC))
+    {
+        return Target::GC;
+    }
+    ARM_COMPUTE_ERROR("No backend exists!");
+}
+
+void force_target_to_graph(Graph &g, Target target)
+{
+    auto &nodes = g.nodes();
+    for(auto &node : nodes)
+    {
+        if(node)
+        {
+            node->set_assigned_target(target);
+        }
+    }
+
+    auto &tensors = g.tensors();
+    for(auto &tensor : tensors)
+    {
+        if(tensor)
+        {
+            tensor->desc().target = target;
+        }
+    }
+}
+
+PassManager create_default_pass_manager(Target target)
+{
+    PassManager pm;
+
+    if(target != Target::GC)
+    {
+        pm.append(support::cpp14::make_unique<InPlaceOperationMutator>());
+        pm.append(support::cpp14::make_unique<NodeFusionMutator>());
+        pm.append(support::cpp14::make_unique<SplitLayerSubTensorMutator>());
+        pm.append(support::cpp14::make_unique<DepthConcatSubTensorMutator>());
+    }
+
+    return pm;
+}
+
+void setup_default_graph_context(GraphContext &ctx)
+{
+    for(const auto &backend : backends::BackendRegistry::get().backends())
+    {
+        backend.second->setup_backend_context(ctx);
+    }
+}
+
+size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension size for an unknown layout!");
+    return descriptor.shape[get_dimension_idx(descriptor, data_layout_dimension)];
+}
+
+size_t get_dimension_idx(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+
+    /* Return the dimension index based on the data layout:
+     * NCHW layout: [N C H W] -> indices [3 2 1 0]
+     * NHWC layout: [N H W C] -> indices [3 2 1 0]
+     * (index 0 is the innermost, fastest-varying dimension)
+     */
+    switch(data_layout_dimension)
+    {
+        case DataLayoutDimension::CHANNEL:
+            return (descriptor.layout == DataLayout::NCHW) ? 2 : 0;
+            break;
+        case DataLayoutDimension::HEIGHT:
+            return (descriptor.layout == DataLayout::NCHW) ? 1 : 2;
+            break;
+        case DataLayoutDimension::WIDTH:
+            return (descriptor.layout == DataLayout::NCHW) ? 0 : 1;
+            break;
+        case DataLayoutDimension::BATCHES:
+            return 3;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout index not supported!");
+            break;
+    }
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/Workload.cpp
similarity index 62%
copy from src/graph/CL/CLMap.cpp
copy to src/graph/Workload.cpp
index 5289ea9..d8046c3 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/Workload.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,45 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/Workload.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorHandle.h"
 
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_compute
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+namespace graph
+{
+void ExecutionTask::operator()()
+{
+    TaskExecutor::get().execute_function(*this);
 }
 
-void CLMap::run()
+void execute_task(ExecutionTask &task)
 {
-    _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+    if(task.task)
+    {
+        task.task->run();
+    }
 }
+
+void ExecutionTask::prepare()
+{
+    if(task)
+    {
+        task->prepare();
+    }
+}
+
+TaskExecutor::TaskExecutor()
+    : execute_function(execute_task)
+{
+}
+
+TaskExecutor &TaskExecutor::get()
+{
+    static TaskExecutor executor;
+    return executor;
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp
new file mode 100644
index 0000000..2803322
--- /dev/null
+++ b/src/graph/backends/BackendRegistry.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/BackendRegistry.h"
+
+using namespace arm_compute::graph::backends;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+BackendRegistry::BackendRegistry()
+    : _registered_backends()
+{
+}
+
+BackendRegistry &BackendRegistry::get()
+{
+    static BackendRegistry instance;
+    return instance;
+}
+
+IDeviceBackend *BackendRegistry::find_backend(Target target)
+{
+    ARM_COMPUTE_ERROR_ON(!contains(target));
+    return _registered_backends[target].get();
+}
+
+bool BackendRegistry::contains(Target target) const
+{
+    auto it = _registered_backends.find(target);
+    return (it != _registered_backends.end());
+}
+
+const std::map<Target, std::unique_ptr<IDeviceBackend>> &BackendRegistry::backends() const
+{
+    return _registered_backends;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
new file mode 100644
index 0000000..bf17f80
--- /dev/null
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLDeviceBackend.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistrar.h"
+#include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
+#include "arm_compute/graph/backends/CL/CLNodeValidator.h"
+#include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
+#include "arm_compute/graph/backends/CL/CLTensorHandle.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+bool file_exists(const std::string &filename)
+{
+    std::ifstream file(filename);
+    return file.good();
+}
+} // namespace
+
+/** Register CL backend */
+static detail::BackendRegistrar<CLDeviceBackend> CLDeviceBackend_registrar(Target::CL);
+
+/** Tuner export file */
+static const std::string tuner_data_filename = "acl_tuner.csv";
+
+CLDeviceBackend::CLDeviceBackend()
+    : _tuner(), _allocator(cl::Context::getDefault())
+{
+}
+
+CLDeviceBackend::~CLDeviceBackend()
+{
+    if(_tuner.tune_new_kernels() && !_tuner.lws_table().empty())
+    {
+        _tuner.save_to_file(tuner_data_filename);
+    }
+}
+
+void CLDeviceBackend::set_kernel_tuning(bool enable_tuning)
+{
+    _tuner.set_tune_new_kernels(enable_tuning);
+}
+
+void CLDeviceBackend::initialize_backend()
+{
+    // Load tuner data if available
+    if(_tuner.lws_table().empty() && file_exists(tuner_data_filename))
+    {
+        _tuner.load_from_file(tuner_data_filename);
+    }
+
+    // Setup Scheduler
+    CLScheduler::get().default_init(&_tuner);
+
+    // Create allocator with new context
+    _allocator = CLBufferAllocator();
+}
+
+void CLDeviceBackend::setup_backend_context(GraphContext &ctx)
+{
+    // Setup tuner
+    set_kernel_tuning(ctx.config().use_tuner);
+
+    // Setup a management backend
+    if(ctx.memory_management_ctx(Target::CL) == nullptr)
+    {
+        MemoryManagerContext mm_ctx;
+        mm_ctx.target      = Target::CL;
+        mm_ctx.intra_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
+        mm_ctx.cross_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
+        mm_ctx.cross_group = std::make_shared<CLMemoryGroup>(mm_ctx.cross_mm);
+
+        ctx.insert_memory_management_ctx(std::move(mm_ctx));
+    }
+}
+
+bool CLDeviceBackend::is_backend_supported()
+{
+    return arm_compute::opencl_is_available();
+}
+
+IAllocator *CLDeviceBackend::backend_allocator()
+{
+    return &_allocator;
+}
+
+std::unique_ptr<ITensorHandle> CLDeviceBackend::create_tensor(const Tensor &tensor)
+{
+    // Get tensor descriptor
+    const TensorDescriptor &tensor_desc = tensor.desc();
+    ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::CL);
+
+    // Create backend tensor handle
+    TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
+    info.set_data_layout(tensor_desc.layout);
+    auto backend_tensor_handle = support::cpp14::make_unique<CLTensorHandle>(info);
+
+    return std::move(backend_tensor_handle);
+}
+
+std::unique_ptr<ITensorHandle> CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+{
+    if(parent == nullptr)
+    {
+        return nullptr;
+    }
+
+    return support::cpp14::make_unique<CLSubTensorHandle>(parent, shape, coords, extend_parent);
+}
+
+std::unique_ptr<arm_compute::IFunction> CLDeviceBackend::configure_node(INode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring CL node with ID : " << node.id() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::CL);
+
+    // Configure node
+    return CLFunctionFactory::create(&node, ctx);
+}
+
+arm_compute::Status CLDeviceBackend::validate_node(INode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating CL node with ID : " << node.id() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::CL);
+
+    return CLNodeValidator::validate(&node);
+}
+
+std::shared_ptr<arm_compute::IMemoryManager> CLDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
+{
+    if(affinity == MemoryManagerAffinity::Offset)
+    {
+        ARM_COMPUTE_LOG_GRAPH_WARNING("CL Backend does not support offset affinity memory management!");
+        return nullptr;
+    }
+
+    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+    auto pool_mgr     = std::make_shared<PoolManager>();
+    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+    mm->set_allocator(&_allocator);
+
+    return mm;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
new file mode 100644
index 0000000..db8a7a0
--- /dev/null
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -0,0 +1,607 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/graph/backends/Utils.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Returns backing tensor of a given tensor
+ *
+ * @param[in] tensor Tensor to extract the backing tensor from
+ *
+ * @return Backing tensor if present else nullptr
+ */
+arm_compute::ICLTensor *get_backing_tensor(arm_compute::graph::Tensor *tensor)
+{
+    arm_compute::ICLTensor *backing_tensor = nullptr;
+    if(tensor != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON(tensor->desc().target != arm_compute::graph::Target::CL);
+        // Get backing tensor handle
+        ITensorHandle *tensor_handle = tensor->handle();
+        // Get backing tensor
+        backing_tensor = (tensor_handle != nullptr) ? polymorphic_cast<ICLTensor *>(&tensor_handle->tensor()) : nullptr;
+    }
+
+    return backing_tensor;
+}
+
+/** Create a backend activation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend activation layer function
+ */
+std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL ActivationLayerNode node with ID : " << node.id() << " and Name: " << node.name()
+        << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor                *input    = get_backing_tensor(node.input(0));
+    ICLTensor                *output   = get_backing_tensor(node.output(0));
+    const ActivationLayerInfo act_info = node.activation_info();
+
+    // Create function
+    auto func = support::cpp14::make_unique<CLActivationLayer>();
+    func->configure(input, output, act_info);
+
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLActivationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
+                               << " InPlace : " << is_in_place_operation(input, output)
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend batch normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend batch normalization layer function
+ */
+std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating CL BatchNormalization node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 5);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor                *input     = get_backing_tensor(node.input(0));
+    ICLTensor                *mean      = get_backing_tensor(node.input(1));
+    ICLTensor                *var       = get_backing_tensor(node.input(2));
+    ICLTensor                *beta      = get_backing_tensor(node.input(3));
+    ICLTensor                *gamma     = get_backing_tensor(node.input(4));
+    ICLTensor                *output    = get_backing_tensor(node.output(0));
+    const float               epsilon   = node.epsilon();
+    const ActivationLayerInfo fused_act = node.fused_activation();
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLBatchNormalizationLayer>();
+    func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLBatchNormalizationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Epsilon: " << epsilon << " "
+                               << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+                               << " InPlace : " << is_in_place_operation(input, output)
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend convolution layer function
+ */
+std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating CL ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor *input   = get_backing_tensor(node.input(0));
+    ICLTensor *weights = get_backing_tensor(node.input(1));
+    ICLTensor *biases  = get_backing_tensor(node.input(2));
+    ICLTensor *output  = get_backing_tensor(node.output(0));
+
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        biases->info()->set_data_type(DataType::S32);
+    }
+
+    const PadStrideInfo     conv_info      = node.convolution_info();
+    const ConvolutionMethod conv_algorithm = node.convolution_method();
+    const bool              fast_math      = node.fast_math_hint() == FastMathHint::ENABLED;
+
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::CL);
+    std::unique_ptr<IFunction>      func;
+    std::string                     func_name;
+
+    if(conv_algorithm == ConvolutionMethod::WINOGRAD)
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<CLWinogradConvolutionLayer>(
+                                        std::string("CLWinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math);
+    }
+    else if(conv_algorithm == ConvolutionMethod::DIRECT)
+    {
+        std::tie(func, func_name) = create_named_function<CLDirectConvolutionLayer>(
+                                        std::string("CLDirectConvolutionLayer"), input, weights, biases, output, conv_info);
+    }
+    else if(conv_algorithm == ConvolutionMethod::GEMM)
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<CLGEMMConvolutionLayer>(std::string("CLGEMMConvolutionLayer"), mm,
+                                                                                                 input, weights, biases, output, conv_info);
+    }
+    else
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<CLConvolutionLayer>(std::string("CLConvolutionLayer"), mm,
+                                                                                             input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), fast_math);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input QuantInfo: " << input->info()->quantization_info()
+                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+    return func;
+}
+
+/** Create a backend layer depth concatenate function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth concatenate layer function
+ */
+std::unique_ptr<arm_compute::IFunction> create_depth_concatenate_layer(DepthConcatenateLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating CL DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Return nullptr if depth concatenate is switched off
+    if(!node.is_enabled())
+    {
+        return nullptr;
+    }
+
+    // Extract IO and info
+    std::vector<arm_compute::ICLTensor *> inputs;
+    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    {
+        inputs.push_back(get_backing_tensor(node.input(i)));
+    }
+    ICLTensor *output = get_backing_tensor(node.output(0));
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLDepthConcatenateLayer>();
+    func->configure(inputs, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLDepthConcatenateLayer"
+                               << " Data Type: " << output->info()->data_type()
+                               << " Shape: " << output->info()->tensor_shape()
+                               << " Num Inputs: " << inputs.size()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend layer depth-wise convolution function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth-wise convolution layer function
+ */
+std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name()
+        << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor *input   = get_backing_tensor(node.input(0));
+    ICLTensor *weights = get_backing_tensor(node.input(1));
+    ICLTensor *biases  = get_backing_tensor(node.input(2));
+    ICLTensor *output  = get_backing_tensor(node.output(0));
+
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        biases->info()->set_data_type(DataType::S32);
+    }
+
+    const PadStrideInfo              conv_info     = node.convolution_info();
+    const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::unique_ptr<IFunction> func;
+    std::string                func_name;
+    if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3)
+    {
+        std::tie(func, func_name) = create_named_function<CLDepthwiseConvolutionLayer3x3>(
+                                        std::string("CLDepthwiseConvolutionLayer3x3"), input, weights, biases, output, conv_info);
+    }
+    else
+    {
+        std::tie(func, func_name) = create_named_function<CLDepthwiseConvolutionLayer>(
+                                        std::string("CLDepthwiseConvolutionLayer"), input, weights, biases, output, conv_info);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input QuantInfo: " << input->info()->quantization_info()
+                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+    return func;
+}
+
+/** Create a backend element-wise operation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend element-wise operation layer function
+ */
+std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor             *input1         = get_backing_tensor(node.input(0));
+    ICLTensor             *input2         = get_backing_tensor(node.input(1));
+    ICLTensor             *output         = get_backing_tensor(node.output(0));
+    const EltwiseOperation eltwise_op     = node.eltwise_operation();
+    const ConvertPolicy    convert_policy = node.convert_policy();
+    ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    std::unique_ptr<IFunction> func = nullptr;
+    std::string                func_name;
+    if(eltwise_op == EltwiseOperation::ADD)
+    {
+        std::tie(func, func_name) = create_named_function<CLArithmeticAddition>(std::string("CLArithmeticAddition"),
+                                                                                input1, input2, output,
+                                                                                convert_policy);
+    }
+    else if(eltwise_op == EltwiseOperation::SUB)
+    {
+        std::tie(func, func_name) = create_named_function<CLArithmeticSubtraction>(
+                                        std::string("CLArithmeticSubtraction"), input1, input2, output, convert_policy);
+    }
+    else if(eltwise_op == EltwiseOperation::MUL)
+    {
+        std::tie(func, func_name) = create_named_function<CLPixelWiseMultiplication>(
+                                        std::string("CLPixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy,
+                                        node.rounding_policy());
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input1->info()->data_type()
+                               << " Shape : " << input1->info()->tensor_shape()
+                               << std::endl);
+
+    return func;
+}
+
+/** Create a backend flatten layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend flatten layer function
+ */
+std::unique_ptr<IFunction> create_flatten_layer(FlattenLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL FlattenLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor *input  = get_backing_tensor(node.input(0));
+    ICLTensor *output = get_backing_tensor(node.output(0));
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLFlattenLayer>();
+    func->configure(input, output);
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLFlattenLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend fully connected layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend fully connected layer function
+ */
+std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL FullyConnectedLayer node with ID : " << node.id() << " and Name: " << node.name()
+        << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor *input   = get_backing_tensor(node.input(0));
+    ICLTensor *weights = get_backing_tensor(node.input(1));
+    ICLTensor *biases  = get_backing_tensor(node.input(2));
+    ICLTensor *output  = get_backing_tensor(node.output(0));
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLFullyConnectedLayer>(get_memory_manager(ctx, Target::CL));
+    func->configure(input, weights, biases, output);
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(weights == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLFullyConnectedLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Biases Shape: " << biases->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend normalization layer function
+ */
+std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL NormalizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor                   *input     = get_backing_tensor(node.input(0));
+    ICLTensor                   *output    = get_backing_tensor(node.output(0));
+    const NormalizationLayerInfo norm_info = node.normalization_info();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLNormalizationLayer>();
+    func->configure(input, output, norm_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLNormalizationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " Normalization info: " << norm_info.type()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend pooling layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend pooling layer function
+ */
+std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL PoolingLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor             *input     = get_backing_tensor(node.input(0));
+    ICLTensor             *output    = get_backing_tensor(node.output(0));
+    const PoolingLayerInfo pool_info = node.pooling_info();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLPoolingLayer>();
+    func->configure(input, output, pool_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLPoolingLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " Pooling info: " << pool_info.pool_type()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend reshape layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend reshape layer function
+ */
+std::unique_ptr<IFunction> create_reshape_layer(ReshapeLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor *input  = get_backing_tensor(node.input(0));
+    ICLTensor *output = get_backing_tensor(node.output(0));
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLReshapeLayer>();
+    func->configure(input, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLReshapeLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend softmax layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend softmax layer function
+ */
+std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating CL SoftmaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ICLTensor *input  = get_backing_tensor(node.input(0));
+    ICLTensor *output = get_backing_tensor(node.output(0));
+    const float beta   = node.beta();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<CLSoftmaxLayer>(get_memory_manager(ctx, Target::CL));
+    func->configure(input, output, beta);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated CLSoftmaxLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+} // namespace
+
+std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
+{
+    if(node == nullptr)
+    {
+        return nullptr;
+    }
+
+    NodeType type = node->type();
+    switch(type)
+    {
+        case NodeType::ActivationLayer:
+            return create_activation_layer(*polymorphic_downcast<ActivationLayerNode *>(node));
+        case NodeType::BatchNormalizationLayer:
+            return create_batch_normalization_layer(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+        case NodeType::ConvolutionLayer:
+            return create_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+        case NodeType::DepthConcatenateLayer:
+            return create_depth_concatenate_layer(*polymorphic_downcast<DepthConcatenateLayerNode *>(node));
+        case NodeType::DepthwiseConvolutionLayer:
+            return create_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::EltwiseLayer:
+            return create_eltwise_layer(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::FlattenLayer:
+            return create_flatten_layer(*polymorphic_downcast<FlattenLayerNode *>(node));
+        case NodeType::FullyConnectedLayer:
+            return create_fully_connected_layer(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+        case NodeType::NormalizationLayer:
+            return create_normalization_layer(*polymorphic_downcast<NormalizationLayerNode *>(node));
+        case NodeType::PoolingLayer:
+            return create_pooling_layer(*polymorphic_downcast<PoolingLayerNode *>(node));
+        case NodeType::ReshapeLayer:
+            return create_reshape_layer(*polymorphic_downcast<ReshapeLayerNode *>(node));
+        case NodeType::SoftmaxLayer:
+            return create_softmax_layer(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        default:
+            return nullptr;
+    }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
new file mode 100644
index 0000000..c16b2e6
--- /dev/null
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLNodeValidator.h"
+
+#include "arm_compute/graph/backends/ValidateHelpers.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+Status CLNodeValidator::validate(INode *node)
+{
+    if(node == nullptr)
+    {
+        return Status{};
+    }
+
+    NodeType type = node->type();
+    switch(type)
+    {
+        case NodeType::ConvolutionLayer:
+            return detail::validate_convolution_layer<CLConvolutionLayer,
+                   CLDirectConvolutionLayer,
+                   CLGEMMConvolutionLayer,
+                   CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+        case NodeType::DepthwiseConvolutionLayer:
+            return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
+                   CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        default:
+            return Status{};
+    }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp
new file mode 100644
index 0000000..016dca7
--- /dev/null
+++ b/src/graph/backends/CL/CLSubTensorHandle.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+    : _sub_tensor(), _parent_handle(nullptr)
+{
+    ARM_COMPUTE_ERROR_ON(!parent_handle);
+    auto parent_tensor = arm_compute::utils::cast::polymorphic_downcast<ICLTensor *>(&parent_handle->tensor());
+    _sub_tensor        = arm_compute::CLSubTensor(parent_tensor, shape, coords, extend_parent);
+    _parent_handle     = parent_handle;
+}
+
+void CLSubTensorHandle::allocate()
+{
+    // noop
+}
+
+void CLSubTensorHandle::free()
+{
+    // noop
+}
+
+void CLSubTensorHandle::manage(IMemoryGroup *mg)
+{
+    ARM_COMPUTE_UNUSED(mg);
+    // noop
+}
+
+void CLSubTensorHandle::map(bool blocking)
+{
+    _sub_tensor.map(blocking);
+}
+
+void CLSubTensorHandle::unmap()
+{
+    _sub_tensor.unmap();
+}
+
+void CLSubTensorHandle::release_if_unused()
+{
+    // noop
+}
+
+const arm_compute::ITensor &CLSubTensorHandle::tensor() const
+{
+    return _sub_tensor;
+}
+
+arm_compute::ITensor &CLSubTensorHandle::tensor()
+{
+    return _sub_tensor;
+}
+
+ITensorHandle *CLSubTensorHandle::parent_handle()
+{
+    ARM_COMPUTE_ERROR_ON(_parent_handle == nullptr);
+    return _parent_handle->parent_handle();
+}
+
+bool CLSubTensorHandle::is_subtensor() const
+{
+    return true;
+}
+
+Target CLSubTensorHandle::target() const
+{
+    return Target::CL;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
new file mode 100644
index 0000000..fdb044c
--- /dev/null
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/CL/CLTensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+CLTensorHandle::CLTensorHandle(const ITensorInfo &info)
+    : _tensor()
+{
+    _tensor.allocator()->init(info);
+}
+
+void CLTensorHandle::allocate()
+{
+    _tensor.allocator()->allocate();
+}
+
+void CLTensorHandle::free()
+{
+    _tensor.allocator()->free();
+}
+
+void CLTensorHandle::manage(IMemoryGroup *mg)
+{
+    if(mg != nullptr)
+    {
+        auto *cl_mg = arm_compute::utils::cast::polymorphic_downcast<CLMemoryGroup *>(mg);
+        cl_mg->manage(&_tensor);
+    }
+}
+
+void CLTensorHandle::map(bool blocking)
+{
+    _tensor.map(blocking);
+}
+
+void CLTensorHandle::unmap()
+{
+    _tensor.unmap();
+}
+
+void CLTensorHandle::release_if_unused()
+{
+    if(!_tensor.is_used())
+    {
+        _tensor.allocator()->free();
+    }
+}
+
+const arm_compute::ITensor &CLTensorHandle::tensor() const
+{
+    return _tensor;
+}
+
+arm_compute::ITensor &CLTensorHandle::tensor()
+{
+    return _tensor;
+}
+
+ITensorHandle *CLTensorHandle::parent_handle()
+{
+    return this;
+}
+
+bool CLTensorHandle::is_subtensor() const
+{
+    return false;
+}
+
+Target CLTensorHandle::target() const
+{
+    return Target::CL;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
new file mode 100644
index 0000000..770cca5
--- /dev/null
+++ b/src/graph/backends/GLES/GCDeviceBackend.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCDeviceBackend.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistrar.h"
+#include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
+#include "arm_compute/graph/backends/GLES/GCNodeValidator.h"
+#include "arm_compute/graph/backends/GLES/GCTensorHandle.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryGroup.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/PoolManager.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+/** Register GLES backend */
+static detail::BackendRegistrar<GCDeviceBackend> GCDeviceBackend_registrar(Target::GC);
+
+GCDeviceBackend::GCDeviceBackend()
+    : _allocator()
+{
+}
+
+void GCDeviceBackend::initialize_backend()
+{
+    // Setup Scheduler
+    GCScheduler::get().default_init();
+}
+
+void GCDeviceBackend::setup_backend_context(GraphContext &ctx)
+{
+    // Setup a management backend
+    if(ctx.memory_management_ctx(Target::GC) == nullptr)
+    {
+        MemoryManagerContext mm_ctx;
+        mm_ctx.target      = Target::GC;
+        mm_ctx.intra_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
+        mm_ctx.cross_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
+        mm_ctx.cross_group = std::make_shared<GCMemoryGroup>(mm_ctx.cross_mm);
+
+        ctx.insert_memory_management_ctx(std::move(mm_ctx));
+    }
+}
+
+bool GCDeviceBackend::is_backend_supported()
+{
+    return arm_compute::opengles31_is_available();
+}
+
+IAllocator *GCDeviceBackend::backend_allocator()
+{
+    return &_allocator;
+}
+
+std::unique_ptr<ITensorHandle> GCDeviceBackend::create_tensor(const Tensor &tensor)
+{
+    // Get tensor descriptor
+    const TensorDescriptor &tensor_desc = tensor.desc();
+    ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::GC);
+
+    // Create backend tensor handle
+    TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
+    info.set_data_layout(tensor_desc.layout);
+    auto backend_tensor_handle = support::cpp14::make_unique<GCTensorHandle>(info);
+
+    return std::move(backend_tensor_handle);
+}
+
+std::unique_ptr<ITensorHandle> GCDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+{
+    ARM_COMPUTE_UNUSED(parent, shape, coords, extend_parent);
+    ARM_COMPUTE_ERROR("GLES backend has no sub-tensor support!");
+    return nullptr;
+}
+
+std::unique_ptr<arm_compute::IFunction> GCDeviceBackend::configure_node(INode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring GC node with ID : " << node.id() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::GC);
+
+    // Configure node
+    return GCFunctionFactory::create(&node, ctx);
+}
+
+arm_compute::Status GCDeviceBackend::validate_node(INode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GC node with ID : " << node.id() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::GC);
+
+    return GCNodeValidator::validate(&node);
+}
+
+std::shared_ptr<arm_compute::IMemoryManager> GCDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
+{
+    if(affinity == MemoryManagerAffinity::Offset)
+    {
+        ARM_COMPUTE_LOG_GRAPH_WARNING("GC Backend does not support offset affinity memory management!");
+        return nullptr;
+    }
+
+    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+    auto pool_mgr     = std::make_shared<PoolManager>();
+    auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+    mm->set_allocator(&_allocator);
+
+    return mm;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
new file mode 100644
index 0000000..e61e840
--- /dev/null
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/graph/backends/Utils.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Returns backing tensor of a given tensor
+ *
+ * @param[in] tensor Tensor to extract the backing tensor from
+ *
+ * @return Backing tensor if present else nullptr
+ */
+arm_compute::IGCTensor *get_backing_tensor(arm_compute::graph::Tensor *tensor)
+{
+    // A null graph tensor (e.g. an optional input) is tolerated and simply
+    // yields nullptr; a non-null tensor must belong to the GC target.
+    arm_compute::IGCTensor *backing_tensor = nullptr;
+    if(tensor != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON(tensor->desc().target != arm_compute::graph::Target::GC);
+        // Get backing tensor handle
+        ITensorHandle *tensor_handle = tensor->handle();
+        // Get backing tensor
+        backing_tensor = (tensor_handle != nullptr) ? polymorphic_cast<IGCTensor *>(&tensor_handle->tensor()) : nullptr;
+    }
+
+    return backing_tensor;
+}
+
+/** Create a backend activation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend activation layer function
+ */
+std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC ActivationLayerNode node with ID : " << node.id() << " and Name: " << node.name()
+        << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor                *input    = get_backing_tensor(node.input(0));
+    IGCTensor                *output   = get_backing_tensor(node.output(0));
+    const ActivationLayerInfo act_info = node.activation_info();
+
+    // Create function
+    auto func = support::cpp14::make_unique<GCActivationLayer>();
+    func->configure(input, output, act_info);
+
+    // Log info (InPlace reports whether input and output share a backing tensor)
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCActivationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
+                               << " InPlace : " << is_in_place_operation(input, output)
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend batch normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend batch normalization layer function
+ */
+std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating GC BatchNormalization node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+    // Inputs: data, mean, variance, beta, gamma (in that order).
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 5);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor                *input     = get_backing_tensor(node.input(0));
+    IGCTensor                *mean      = get_backing_tensor(node.input(1));
+    IGCTensor                *var       = get_backing_tensor(node.input(2));
+    IGCTensor                *beta      = get_backing_tensor(node.input(3));
+    IGCTensor                *gamma     = get_backing_tensor(node.input(4));
+    IGCTensor                *output    = get_backing_tensor(node.output(0));
+    const float               epsilon   = node.epsilon();
+    const ActivationLayerInfo fused_act = node.fused_activation();
+
+    // Create and configure function (fused activation is applied by the kernel
+    // itself when enabled)
+    auto func = support::cpp14::make_unique<GCBatchNormalizationLayer>();
+    func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCBatchNormalizationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Epsilon: " << epsilon << " "
+                               << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+                               << " InPlace : " << is_in_place_operation(input, output)
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend convolution layer function
+ */
+std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating GC ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor *input   = get_backing_tensor(node.input(0));
+    IGCTensor *weights = get_backing_tensor(node.input(1));
+    IGCTensor *biases  = get_backing_tensor(node.input(2));
+    IGCTensor *output  = get_backing_tensor(node.output(0));
+
+    // For quantized convolutions the bias must be S32.
+    // NOTE(review): biases is dereferenced without a null check here; presumably
+    // quantized graphs always carry a bias tensor — confirm against callers.
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        biases->info()->set_data_type(DataType::S32);
+    }
+
+    const PadStrideInfo     conv_info      = node.convolution_info();
+    const ConvolutionMethod conv_algorithm = node.convolution_method();
+
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::GC);
+    std::unique_ptr<IFunction>      func;
+    std::string                     func_name;
+
+    // DIRECT uses the direct-convolution kernel; anything else falls back to
+    // the memory-managed GEMM-based GCConvolutionLayer.
+    if(conv_algorithm == ConvolutionMethod::DIRECT)
+    {
+        std::tie(func, func_name) = create_named_function<GCDirectConvolutionLayer>(
+                                        std::string("GCDirectConvolutionLayer"), input, weights, biases, output, conv_info);
+    }
+    else
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<GCConvolutionLayer>(std::string("GCConvolutionLayer"), mm,
+                                                                                             input, weights, biases, output, conv_info);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input QuantInfo: " << input->info()->quantization_info()
+                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+    return func;
+}
+
+/** Create a backend layer depth concatenate function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth concatenate layer function
+ */
+std::unique_ptr<arm_compute::IFunction> create_depth_concatenate_layer(DepthConcatenateLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating GC DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Return nullptr if depth concatenate is switched off
+    // (the node can be disabled by mutators when the concat is a no-op).
+    if(!node.is_enabled())
+    {
+        return nullptr;
+    }
+
+    // Extract IO and info: gather every input's backing tensor in order.
+    std::vector<arm_compute::IGCTensor *> inputs;
+    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    {
+        inputs.push_back(get_backing_tensor(node.input(i)));
+    }
+    IGCTensor *output = get_backing_tensor(node.output(0));
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<GCDepthConcatenateLayer>();
+    func->configure(inputs, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCDepthConcatenateLayer"
+                               << " Data Type: " << output->info()->data_type()
+                               << " Shape: " << output->info()->tensor_shape()
+                               << " Num Inputs: " << inputs.size()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend layer depth-wise convolution function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth-wise convolution layer function
+ */
+std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name()
+        << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor *input   = get_backing_tensor(node.input(0));
+    IGCTensor *weights = get_backing_tensor(node.input(1));
+    IGCTensor *biases  = get_backing_tensor(node.input(2));
+    IGCTensor *output  = get_backing_tensor(node.output(0));
+
+    // For quantized runs the bias must be S32.
+    // NOTE(review): biases is dereferenced without a null check — presumably
+    // quantized graphs always carry a bias tensor; confirm against callers.
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        biases->info()->set_data_type(DataType::S32);
+    }
+
+    const PadStrideInfo              conv_info     = node.convolution_info();
+    const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+
+    // Create and configure function (we assume that functions have been validated before creation)
+    // Only the optimized 3x3 kernel exists on GLES; the validator is expected
+    // to have forced OPTIMIZED_3x3 for supported shapes.
+    std::unique_ptr<IFunction> func;
+    std::string                func_name;
+    if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3)
+    {
+        std::tie(func, func_name) = create_named_function<GCDepthwiseConvolutionLayer3x3>(
+                                        std::string("GCDepthwiseConvolutionLayer3x3"), input, weights, biases, output, conv_info);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Generic DepthwiseConvolutionLayer is not supported in GLES backend");
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input QuantInfo: " << input->info()->quantization_info()
+                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+    return func;
+}
+
+/** Create a backend element-wise operation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend element-wise operation layer function
+ */
+std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor             *input1         = get_backing_tensor(node.input(0));
+    IGCTensor             *input2         = get_backing_tensor(node.input(1));
+    IGCTensor             *output         = get_backing_tensor(node.output(0));
+    const EltwiseOperation eltwise_op     = node.eltwise_operation();
+    const ConvertPolicy    convert_policy = node.convert_policy();
+    ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Dispatch on the element-wise operation; only ADD and MUL are
+    // implemented on GLES (SUB aborts, anything else is a programming error).
+    std::unique_ptr<IFunction> func = nullptr;
+    std::string                func_name;
+    if(eltwise_op == EltwiseOperation::ADD)
+    {
+        std::tie(func, func_name) = create_named_function<GCArithmeticAddition>(std::string("GCArithmeticAddition"),
+                                                                                input1, input2, output,
+                                                                                convert_policy);
+    }
+    else if(eltwise_op == EltwiseOperation::SUB)
+    {
+        ARM_COMPUTE_ERROR("Arithmetic subtraction is not supported in GLES backend");
+    }
+    else if(eltwise_op == EltwiseOperation::MUL)
+    {
+        // Multiplication uses a fixed scale of 1.f (plain element-wise product).
+        std::tie(func, func_name) = create_named_function<GCPixelWiseMultiplication>(
+                                        std::string("GCPixelWiseMultiplication"), input1, input2, output, 1.f);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input1->info()->data_type()
+                               << " Shape : " << input1->info()->tensor_shape()
+                               << std::endl);
+
+    return func;
+}
+
+/** Create a backend fully connected layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend fully connected layer function
+ */
+std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC FullyConnectedLayer node with ID : " << node.id() << " and Name: " << node.name()
+        << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor *input   = get_backing_tensor(node.input(0));
+    IGCTensor *weights = get_backing_tensor(node.input(1));
+    IGCTensor *biases  = get_backing_tensor(node.input(2));
+    IGCTensor *output  = get_backing_tensor(node.output(0));
+
+    // Validate the backing tensors BEFORE configuring: previously these checks
+    // ran after configure() had already dereferenced the pointers, so they
+    // could never catch a null tensor in time.
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(weights == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function (memory-managed, like the other GC functions)
+    auto func = support::cpp14::make_unique<GCFullyConnectedLayer>(get_memory_manager(ctx, Target::GC));
+    func->configure(input, weights, biases, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCFullyConnectedLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Biases Shape: " << biases->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend normalization layer function
+ */
+std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC NormalizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info; backing tensors must exist before configuration.
+    IGCTensor                   *input     = get_backing_tensor(node.input(0));
+    IGCTensor                   *output    = get_backing_tensor(node.output(0));
+    const NormalizationLayerInfo norm_info = node.normalization_info();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<GCNormalizationLayer>();
+    func->configure(input, output, norm_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCNormalizationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " Normalization info: " << norm_info.type()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend pooling layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend pooling layer function
+ */
+std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC PoolingLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info; backing tensors must exist before configuration.
+    IGCTensor             *input     = get_backing_tensor(node.input(0));
+    IGCTensor             *output    = get_backing_tensor(node.output(0));
+    const PoolingLayerInfo pool_info = node.pooling_info();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<GCPoolingLayer>();
+    func->configure(input, output, pool_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCPoolingLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " Pooling info: " << pool_info.pool_type()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend softmax layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend softmax layer function
+ */
+std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+        "Creating GC SoftmaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    IGCTensor *input  = get_backing_tensor(node.input(0));
+    IGCTensor *output = get_backing_tensor(node.output(0));
+    const float beta   = node.beta();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function.
+    // Fix: request the GC memory manager from the context — every other GC
+    // function uses Target::GC here; Target::CL fetched the CL backend's
+    // manager (or none) inside the GLES backend.
+    auto func = support::cpp14::make_unique<GCSoftmaxLayer>(get_memory_manager(ctx, Target::GC));
+    func->configure(input, output, beta);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated GCSoftmaxLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+} // namespace
+
+std::unique_ptr<IFunction> GCFunctionFactory::create(INode *node, GraphContext &ctx)
+{
+    // A null node yields no function; unsupported node types fall through to
+    // the default case and also return nullptr.
+    if(node == nullptr)
+    {
+        return nullptr;
+    }
+
+    NodeType type = node->type();
+    switch(type)
+    {
+        case NodeType::ActivationLayer:
+            return create_activation_layer(*polymorphic_downcast<ActivationLayerNode *>(node));
+        case NodeType::BatchNormalizationLayer:
+            return create_batch_normalization_layer(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+        case NodeType::ConvolutionLayer:
+            return create_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+        case NodeType::DepthConcatenateLayer:
+            return create_depth_concatenate_layer(*polymorphic_downcast<DepthConcatenateLayerNode *>(node));
+        case NodeType::DepthwiseConvolutionLayer:
+            return create_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::EltwiseLayer:
+            return create_eltwise_layer(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::FullyConnectedLayer:
+            return create_fully_connected_layer(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+        case NodeType::NormalizationLayer:
+            return create_normalization_layer(*polymorphic_downcast<NormalizationLayerNode *>(node));
+        case NodeType::PoolingLayer:
+            return create_pooling_layer(*polymorphic_downcast<PoolingLayerNode *>(node));
+        case NodeType::SoftmaxLayer:
+            return create_softmax_layer(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        default:
+            return nullptr;
+    }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
new file mode 100644
index 0000000..c7f7d81
--- /dev/null
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCNodeValidator.h"
+
+#include "arm_compute/graph/backends/ValidateHelpers.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Validates a Depthwise Convolution layer node
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GCDepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
+    ARM_COMPUTE_ERROR_ON(weights == nullptr);
+
+    // Validate function: only the optimized 3x3 kernel exists on GLES, so
+    // BOTH kernel dimensions must be 3. Fix: the previous '&&' condition let
+    // non-square kernels (e.g. 3x5) through and then selected the 3x3 kernel.
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->tensor_shape().x() != 3 || weights->tensor_shape().y() != 3, "Unsupported depthwise convolution");
+    node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::OPTIMIZED_3x3);
+
+    return Status{};
+}
+/** Validates a Convolution layer node
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+Status validate_convolution_layer(ConvolutionLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    // NOTE(review): unlike the depthwise validator, weights is not checked for
+    // nullptr before being dereferenced below — confirm backing tensor always exists.
+    arm_compute::ITensorInfo *weights        = detail::get_backing_tensor_info(node.input(1));
+    const PadStrideInfo       conv_info      = node.convolution_info();
+    const ConvolutionMethod   conv_algorithm = node.convolution_method();
+
+    // Validate function: the direct kernel only supports square 1x1/3x3/5x5
+    // weights with strides <= 2; otherwise fall back to the default method.
+    if(conv_algorithm == ConvolutionMethod::DIRECT)
+    {
+        bool is_square         = weights->tensor_shape().x() == weights->tensor_shape().y();
+        bool is_direct         = (weights->tensor_shape().x() == 1) || (weights->tensor_shape().x() == 3) || (weights->tensor_shape().x() == 5);
+        bool is_correct_stride = (conv_info.stride().first) <= 2 && (conv_info.stride().second <= 2);
+        if(!(is_square && is_direct && is_correct_stride))
+        {
+            node.set_convolution_method(ConvolutionMethod::DEFAULT);
+        }
+    }
+
+    return Status{};
+}
+} // namespace
+
+Status GCNodeValidator::validate(INode *node)
+{
+    // A null node is vacuously valid.
+    if(node == nullptr)
+    {
+        return Status{};
+    }
+
+    NodeType type = node->type();
+    switch(type)
+    {
+        case NodeType::ConvolutionLayer:
+            return validate_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+        case NodeType::DepthwiseConvolutionLayer:
+            return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        // Flatten and Reshape have no GLES implementation; report a runtime error.
+        case NodeType::FlattenLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation");
+        case NodeType::ReshapeLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation");
+        // All remaining node types are assumed valid for this backend.
+        default:
+            return Status{};
+    }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/GLES/GCTensorHandle.cpp b/src/graph/backends/GLES/GCTensorHandle.cpp
new file mode 100644
index 0000000..6f96263
--- /dev/null
+++ b/src/graph/backends/GLES/GCTensorHandle.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/GLES/GCTensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryGroup.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+// Initializes the backing GC tensor's metadata only; no GPU memory is
+// allocated until allocate() is called.
+GCTensorHandle::GCTensorHandle(const ITensorInfo &info)
+    : _tensor()
+{
+    _tensor.allocator()->init(info);
+}
+
+void GCTensorHandle::allocate()
+{
+    _tensor.allocator()->allocate();
+}
+
+void GCTensorHandle::free()
+{
+    _tensor.allocator()->free();
+}
+
+// Registers the tensor with a GC memory group (if any) so its backing memory
+// can be managed/reused by the graph's memory manager.
+void GCTensorHandle::manage(IMemoryGroup *mg)
+{
+    if(mg != nullptr)
+    {
+        auto *gc_mg = arm_compute::utils::cast::polymorphic_downcast<GCMemoryGroup *>(mg);
+        gc_mg->manage(&_tensor);
+    }
+}
+
+void GCTensorHandle::map(bool blocking)
+{
+    _tensor.map(blocking);
+}
+
+void GCTensorHandle::unmap()
+{
+    _tensor.unmap();
+}
+
+// Frees the backing memory once the graph no longer uses this tensor.
+void GCTensorHandle::release_if_unused()
+{
+    if(!_tensor.is_used())
+    {
+        _tensor.allocator()->free();
+    }
+}
+
+const arm_compute::ITensor &GCTensorHandle::tensor() const
+{
+    return _tensor;
+}
+
+arm_compute::ITensor &GCTensorHandle::tensor()
+{
+    return _tensor;
+}
+
+// Not a sub-tensor, so the handle is its own parent.
+ITensorHandle *GCTensorHandle::parent_handle()
+{
+    return this;
+}
+
+bool GCTensorHandle::is_subtensor() const
+{
+    return false;
+}
+
+Target GCTensorHandle::target() const
+{
+    return Target::GC;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
new file mode 100644
index 0000000..7c2db40
--- /dev/null
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NEDeviceBackend.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistrar.h"
+#include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
+#include "arm_compute/graph/backends/NEON/NENodeValidator.h"
+#include "arm_compute/graph/backends/NEON/NESubTensorHandle.h"
+#include "arm_compute/graph/backends/NEON/NETensorHandle.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Allocator.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
+#include "arm_compute/runtime/OffsetLifetimeManager.h"
+#include "arm_compute/runtime/PoolManager.h"
+#include "arm_compute/runtime/Scheduler.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+/** Register NEON backend */
+static detail::BackendRegistrar<NEDeviceBackend> NEDeviceBackend_registrar(Target::NEON);
+
+NEDeviceBackend::NEDeviceBackend()
+    : _allocator()
+{
+}
+
+// Nothing to do: the CPU (NEON) backend needs no global initialisation.
+void NEDeviceBackend::initialize_backend()
+{
+}
+
+// Sets up NEON-specific state in the graph context: scheduler thread count and
+// a memory-management context, created at most once per context.
+void NEDeviceBackend::setup_backend_context(GraphContext &ctx)
+{
+    // Set number of threads
+    if(ctx.config().num_threads >= 0)
+    {
+        Scheduler::get().set_num_threads(ctx.config().num_threads);
+    }
+
+    // Create function level memory manager
+    if(ctx.memory_management_ctx(Target::NEON) == nullptr)
+    {
+        MemoryManagerContext mm_ctx;
+        mm_ctx.target      = Target::NEON;
+        mm_ctx.intra_mm    = create_memory_manager(MemoryManagerAffinity::Offset);
+        mm_ctx.cross_mm    = create_memory_manager(MemoryManagerAffinity::Offset);
+        mm_ctx.cross_group = std::make_shared<MemoryGroup>(mm_ctx.cross_mm);
+
+        ctx.insert_memory_management_ctx(std::move(mm_ctx));
+    }
+}
+
+// NEON (CPU) execution is always available.
+bool NEDeviceBackend::is_backend_supported()
+{
+    return true;
+}
+
+// Returns the backend's CPU allocator (non-owning pointer to a member).
+IAllocator *NEDeviceBackend::backend_allocator()
+{
+    return &_allocator;
+}
+
+// Creates a NEON tensor handle whose TensorInfo mirrors the graph tensor's
+// descriptor (shape, data type, quantization info and data layout).
+std::unique_ptr<ITensorHandle> NEDeviceBackend::create_tensor(const Tensor &tensor)
+{
+    // Get tensor descriptor
+    const TensorDescriptor &tensor_desc = tensor.desc();
+    ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::NEON);
+
+    // Create backend tensor handle
+    TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
+    info.set_data_layout(tensor_desc.layout);
+    auto backend_tensor_handle = support::cpp14::make_unique<NETensorHandle>(info);
+
+    // std::move is required to convert unique_ptr<NETensorHandle> to
+    // unique_ptr<ITensorHandle> on older toolchains.
+    return std::move(backend_tensor_handle);
+}
+
+// Creates a sub-tensor handle viewing @p shape at @p coords inside @p parent.
+// Returns nullptr when there is no parent handle to view into.
+std::unique_ptr<ITensorHandle> NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+{
+    if(parent == nullptr)
+    {
+        return nullptr;
+    }
+
+    return support::cpp14::make_unique<NESubTensorHandle>(parent, shape, coords, extend_parent);
+}
+
+// Delegates creation of the runtime function for @p node to the NEON function factory.
+std::unique_ptr<arm_compute::IFunction> NEDeviceBackend::configure_node(INode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring NEON node with ID : " << node.id() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::NEON);
+
+    // Configure node
+    return NEFunctionFactory::create(&node, ctx);
+}
+
+// Validates that @p node can be executed by the NEON backend.
+arm_compute::Status NEDeviceBackend::validate_node(INode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating NEON node with ID : " << node.id() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::NEON);
+
+    return NENodeValidator::validate(&node);
+}
+
+// Builds a memory manager backed by this backend's allocator. Buffer affinity
+// uses blob-based lifetime management, any other affinity uses offset-based.
+std::shared_ptr<arm_compute::IMemoryManager> NEDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
+{
+    std::shared_ptr<ILifetimeManager> lifetime_mgr = nullptr;
+    if(affinity == MemoryManagerAffinity::Buffer)
+    {
+        lifetime_mgr = std::make_shared<BlobLifetimeManager>();
+    }
+    else
+    {
+        lifetime_mgr = std::make_shared<OffsetLifetimeManager>();
+    }
+    auto pool_mgr = std::make_shared<PoolManager>();
+    auto mm       = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
+
+    mm->set_allocator(&_allocator);
+
+    return mm;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
new file mode 100644
index 0000000..7b1c50f
--- /dev/null
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -0,0 +1,579 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/backends/Utils.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+namespace
+{
+/** Returns backing tensor of a given tensor
+ *
+ * @param[in] tensor Tensor to extract the backing tensor from
+ *
+ * @return Backing tensor if present else nullptr
+ */
+arm_compute::ITensor *get_backing_tensor(arm_compute::graph::Tensor *tensor)
+{
+    // Both the graph tensor and its handle must exist for a backing tensor to be available.
+    return ((tensor == nullptr) || (tensor->handle() == nullptr)) ? nullptr : &tensor->handle()->tensor();
+}
+
+/** Create a backend activation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend activation layer function
+ */
+std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON ActivationLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor                  *input    = get_backing_tensor(node.input(0));
+    ITensor                  *output   = get_backing_tensor(node.output(0));
+    const ActivationLayerInfo act_info = node.activation_info();
+
+    // Create function
+    auto func = support::cpp14::make_unique<NEActivationLayer>();
+    func->configure(input, output, act_info);
+
+    // NOTE(review): the logging below dereferences input without a null check;
+    // assumes get_backing_tensor never returns nullptr here — confirm upstream.
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEActivationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
+                               << " InPlace : " << is_in_place_operation(input, output)
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend batch normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend batch normalization layer function
+ */
+std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON BatchNormalization node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+    // Inputs are ordered: data, mean, variance, beta, gamma.
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 5);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor                  *input     = get_backing_tensor(node.input(0));
+    ITensor                  *mean      = get_backing_tensor(node.input(1));
+    ITensor                  *var       = get_backing_tensor(node.input(2));
+    ITensor                  *beta      = get_backing_tensor(node.input(3));
+    ITensor                  *gamma     = get_backing_tensor(node.input(4));
+    ITensor                  *output    = get_backing_tensor(node.output(0));
+    const float               epsilon   = node.epsilon();
+    // Activation possibly fused into the batch-normalization function.
+    const ActivationLayerInfo fused_act = node.fused_activation();
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<NEBatchNormalizationLayer>();
+    func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEBatchNormalizationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << " Epsilon: " << epsilon << " "
+                               << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+                               << " InPlace : " << is_in_place_operation(input, output)
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend convolution layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend convolution layer function
+ */
+std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor *input   = get_backing_tensor(node.input(0));
+    ITensor *weights = get_backing_tensor(node.input(1));
+    ITensor *biases  = get_backing_tensor(node.input(2));
+    ITensor *output  = get_backing_tensor(node.output(0));
+
+    // Quantized asymmetric convolutions accumulate in S32, so the bias must be S32.
+    // Guard against bias-less convolutions: get_backing_tensor() returns nullptr
+    // when input(2) is absent, and the original code dereferenced it unconditionally.
+    if(biases != nullptr && is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        biases->info()->set_data_type(DataType::S32);
+    }
+
+    const PadStrideInfo     conv_info      = node.convolution_info();
+    const ConvolutionMethod conv_algorithm = node.convolution_method();
+
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::NEON);
+    std::unique_ptr<IFunction>      func;
+    std::string                     func_name;
+    if(conv_algorithm == ConvolutionMethod::DIRECT)
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<NEDirectConvolutionLayer>(std::string("NEDirectConvolutionLayer"), mm,
+                                                                                                   input, weights, biases, output, conv_info);
+    }
+    else if(conv_algorithm == ConvolutionMethod::GEMM)
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<NEGEMMConvolutionLayer>(std::string("NEGEMMConvolutionLayer"), mm,
+                                                                                                 input, weights, biases, output, conv_info);
+    }
+    else if(conv_algorithm == ConvolutionMethod::WINOGRAD)
+    {
+        std::tie(func, func_name) = create_named_memory_managed_function<NEWinogradConvolutionLayer>(std::string("NEWinogradConvolutionLayer"), mm,
+                                                                                                     input, weights, biases, output, conv_info);
+    }
+    else
+    {
+        // Generic entry point: lets NEConvolutionLayer pick the method itself.
+        std::tie(func, func_name) = create_named_memory_managed_function<NEConvolutionLayer>(std::string("NEConvolutionLayer"), mm,
+                                                                                             input, weights, biases, output, conv_info);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input QuantInfo: " << input->info()->quantization_info()
+                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+    return func;
+}
+
+/** Create a backend layer depth concatenate function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth concatenate layer function
+ */
+std::unique_ptr<arm_compute::IFunction> create_depth_concatenate_layer(DepthConcatenateLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON DepthConcatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Return nullptr if depth concatenate is switched off
+    // (a disabled node is a no-op and needs no runtime function).
+    if(!node.is_enabled())
+    {
+        return nullptr;
+    }
+
+    // Extract IO and info
+    std::vector<arm_compute::ITensor *> inputs;
+    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    {
+        inputs.push_back(get_backing_tensor(node.input(i)));
+    }
+    ITensor *output = get_backing_tensor(node.output(0));
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<NEDepthConcatenateLayer>();
+    func->configure(inputs, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEDepthConcatenateLayer"
+                               << " Data Type: " << output->info()->data_type()
+                               << " Shape: " << output->info()->tensor_shape()
+                               << " Num Inputs: " << inputs.size()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend layer depth-wise convolution function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth-wise convolution layer function
+ */
+std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor *input   = get_backing_tensor(node.input(0));
+    ITensor *weights = get_backing_tensor(node.input(1));
+    ITensor *biases  = get_backing_tensor(node.input(2));
+    ITensor *output  = get_backing_tensor(node.output(0));
+
+    // Quantized asymmetric depthwise convolutions accumulate in S32, so the bias must be S32.
+    // Guard against bias-less nodes: get_backing_tensor() returns nullptr when
+    // input(2) is absent, and the original code dereferenced it unconditionally.
+    if(biases != nullptr && is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        biases->info()->set_data_type(DataType::S32);
+    }
+
+    const PadStrideInfo              conv_info     = node.convolution_info();
+    const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::unique_ptr<IFunction> func;
+    std::string                func_name;
+    if(dwc_algorithm == DepthwiseConvolutionMethod::OPTIMIZED_3x3)
+    {
+        std::tie(func, func_name) = create_named_function<NEDepthwiseConvolutionLayer3x3>(std::string("NEDepthwiseConvolutionLayer3x3"),
+                                                                                          input, weights, biases, output, conv_info);
+    }
+    else
+    {
+        std::tie(func, func_name) = create_named_function<NEDepthwiseConvolutionLayer>(std::string("NEDepthwiseConvolutionLayer"),
+                                                                                       input, weights, biases, output, conv_info);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input QuantInfo: " << input->info()->quantization_info()
+                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+    return func;
+}
+
+/** Create a backend element-wise operation layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend element-wise operation layer function
+ */
+std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor               *input1         = get_backing_tensor(node.input(0));
+    ITensor               *input2         = get_backing_tensor(node.input(1));
+    ITensor               *output         = get_backing_tensor(node.output(0));
+    const EltwiseOperation eltwise_op     = node.eltwise_operation();
+    const ConvertPolicy    convert_policy = node.convert_policy();
+    ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    std::unique_ptr<IFunction> func = nullptr;
+    std::string                func_name;
+    if(eltwise_op == EltwiseOperation::ADD)
+    {
+        std::tie(func, func_name) = create_named_function<NEArithmeticAddition>(std::string("NEArithmeticAddition"),
+                                                                                input1, input2, output, convert_policy);
+    }
+    else if(eltwise_op == EltwiseOperation::SUB)
+    {
+        std::tie(func, func_name) = create_named_function<NEArithmeticSubtraction>(std::string("NEArithmeticSubtraction"),
+                                                                                   input1, input2, output, convert_policy);
+    }
+    else if(eltwise_op == EltwiseOperation::MUL)
+    {
+        // Multiplication uses a fixed scale of 1.f plus the node's rounding policy.
+        std::tie(func, func_name) = create_named_function<NEPixelWiseMultiplication>(std::string("NEPixelWiseMultiplication"),
+                                                                                     input1, input2, output, 1.f,
+                                                                                     convert_policy, node.rounding_policy());
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+                               << " Data Type: " << input1->info()->data_type()
+                               << " Shape : " << input1->info()->tensor_shape()
+                               << std::endl);
+
+    return func;
+}
+
+/** Create a backend flatten layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend flatten layer function
+ */
+std::unique_ptr<IFunction> create_flatten_layer(FlattenLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON FlattenLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor *input  = get_backing_tensor(node.input(0));
+    ITensor *output = get_backing_tensor(node.output(0));
+    // Check tensors BEFORE they are used: the original code asserted only after
+    // configure(), by which point a null tensor would already have been dereferenced.
+    // This also matches the order used by the other creator functions in this file.
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<NEFlattenLayer>();
+    func->configure(input, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEFlattenLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend fully connected layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend fully connected layer function
+ */
+std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON FullyConnectedLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor *input   = get_backing_tensor(node.input(0));
+    ITensor *weights = get_backing_tensor(node.input(1));
+    ITensor *biases  = get_backing_tensor(node.input(2));
+    ITensor *output  = get_backing_tensor(node.output(0));
+    // Check tensors BEFORE they are used: the original code asserted only after
+    // configure(), by which point a null tensor would already have been dereferenced.
+    // This also matches the order used by the other creator functions in this file.
+    // (biases is intentionally unchecked: a bias-less layer passes nullptr.)
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(weights == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function, memory-managed via the context's NEON memory manager.
+    auto func = support::cpp14::make_unique<NEFullyConnectedLayer>(get_memory_manager(ctx, Target::NEON));
+    func->configure(input, weights, biases, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEFullyConnectedLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend normalization layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend normalization layer function
+ */
+std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON NormalizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor                     *input     = get_backing_tensor(node.input(0));
+    ITensor                     *output    = get_backing_tensor(node.output(0));
+    const NormalizationLayerInfo norm_info = node.normalization_info();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function, memory-managed via the context's NEON memory manager.
+    auto func = support::cpp14::make_unique<NENormalizationLayer>(get_memory_manager(ctx, Target::NEON));
+    func->configure(input, output, norm_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NENormalizationLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " Normalization info: " << norm_info.type()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend pooling layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend pooling layer function
+ */
+std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON PoolingLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor               *input     = get_backing_tensor(node.input(0));
+    ITensor               *output    = get_backing_tensor(node.output(0));
+    const PoolingLayerInfo pool_info = node.pooling_info();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<NEPoolingLayer>();
+    func->configure(input, output, pool_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEPoolingLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " Pooling info: " << pool_info.pool_type()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend reshape layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend reshape layer function
+ */
+std::unique_ptr<IFunction> create_reshape_layer(ReshapeLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor *input  = get_backing_tensor(node.input(0));
+    ITensor *output = get_backing_tensor(node.output(0));
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function; the output shape comes from the output tensor's info.
+    auto func = support::cpp14::make_unique<NEReshapeLayer>();
+    func->configure(input, output);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NEReshapeLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+
+/** Create a backend softmax layer function
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend softmax layer function
+ */
+std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphContext &ctx)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating NEON SoftmaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    ITensor    *input  = get_backing_tensor(node.input(0));
+    ITensor    *output = get_backing_tensor(node.output(0));
+    // Beta scales the logits before the exponential.
+    const float beta   = node.beta();
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function, memory-managed via the context's NEON memory manager.
+    auto func = support::cpp14::make_unique<NESoftmaxLayer>(get_memory_manager(ctx, Target::NEON));
+    func->configure(input, output, beta);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated NESoftmaxLayer"
+                               << " Data Type: " << input->info()->data_type()
+                               << " Input shape: " << input->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(func);
+}
+} // namespace
+
+// Dispatches to the node-type-specific creator above.
+// Returns nullptr for a null node or for a node type the NEON backend
+// does not provide a function for.
+std::unique_ptr<IFunction> NEFunctionFactory::create(INode *node, GraphContext &ctx)
+{
+    if(node == nullptr)
+    {
+        return nullptr;
+    }
+
+    NodeType type = node->type();
+    switch(type)
+    {
+        case NodeType::ActivationLayer:
+            return create_activation_layer(*polymorphic_downcast<ActivationLayerNode *>(node));
+        case NodeType::BatchNormalizationLayer:
+            return create_batch_normalization_layer(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+        case NodeType::ConvolutionLayer:
+            return create_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+        case NodeType::DepthConcatenateLayer:
+            return create_depth_concatenate_layer(*polymorphic_downcast<DepthConcatenateLayerNode *>(node));
+        case NodeType::DepthwiseConvolutionLayer:
+            return create_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::EltwiseLayer:
+            return create_eltwise_layer(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::FlattenLayer:
+            return create_flatten_layer(*polymorphic_downcast<FlattenLayerNode *>(node));
+        case NodeType::FullyConnectedLayer:
+            return create_fully_connected_layer(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+        case NodeType::NormalizationLayer:
+            return create_normalization_layer(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+        case NodeType::PoolingLayer:
+            return create_pooling_layer(*polymorphic_downcast<PoolingLayerNode *>(node));
+        case NodeType::ReshapeLayer:
+            return create_reshape_layer(*polymorphic_downcast<ReshapeLayerNode *>(node));
+        case NodeType::SoftmaxLayer:
+            return create_softmax_layer(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        default:
+            return nullptr;
+    }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
new file mode 100644
index 0000000..e438e79
--- /dev/null
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NENodeValidator.h"
+
+#include "arm_compute/graph/backends/ValidateHelpers.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+
+using namespace arm_compute::utils::cast;
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+Status NENodeValidator::validate(INode *node)
+{
+    if(node == nullptr)
+    {
+        return Status{};
+    }
+
+    NodeType type = node->type();
+    switch(type)
+    {
+        case NodeType::ConvolutionLayer:
+            return detail::validate_convolution_layer<NEConvolutionLayer,
+                   NEDirectConvolutionLayer,
+                   NEGEMMConvolutionLayer,
+                   NEWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+        case NodeType::DepthwiseConvolutionLayer:
+            return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
+                   NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+
+        default:
+            return Status{};
+    }
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp
new file mode 100644
index 0000000..c0acedd
--- /dev/null
+++ b/src/graph/backends/NEON/NESubTensorHandle.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NESubTensorHandle.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+    : _sub_tensor(), _parent_handle(nullptr)
+{
+    ARM_COMPUTE_ERROR_ON(!parent_handle);
+    _sub_tensor    = arm_compute::SubTensor(&parent_handle->tensor(), shape, coords, extend_parent);
+    _parent_handle = parent_handle;
+}
+
+void NESubTensorHandle::allocate()
+{
+    // noop
+}
+
+void NESubTensorHandle::free()
+{
+    // noop
+}
+
+void NESubTensorHandle::manage(IMemoryGroup *mg)
+{
+    ARM_COMPUTE_UNUSED(mg);
+    // noop
+}
+
+void NESubTensorHandle::map(bool blocking)
+{
+    ARM_COMPUTE_UNUSED(blocking);
+}
+
+void NESubTensorHandle::unmap()
+{
+    // noop
+}
+
+void NESubTensorHandle::release_if_unused()
+{
+    // noop
+}
+
+const arm_compute::ITensor &NESubTensorHandle::tensor() const
+{
+    return _sub_tensor;
+}
+
+arm_compute::ITensor &NESubTensorHandle::tensor()
+{
+    return _sub_tensor;
+}
+
+ITensorHandle *NESubTensorHandle::parent_handle()
+{
+    ARM_COMPUTE_ERROR_ON(_parent_handle == nullptr);
+    return _parent_handle->parent_handle();
+}
+
+bool NESubTensorHandle::is_subtensor() const
+{
+    return true;
+}
+
+Target NESubTensorHandle::target() const
+{
+    return Target::NEON;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
new file mode 100644
index 0000000..caa2c10
--- /dev/null
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/backends/NEON/NETensorHandle.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace backends
+{
+NETensorHandle::NETensorHandle(const ITensorInfo &info)
+    : _tensor()
+{
+    _tensor.allocator()->init(info);
+}
+
+void NETensorHandle::allocate()
+{
+    _tensor.allocator()->allocate();
+}
+
+void NETensorHandle::free()
+{
+    _tensor.allocator()->free();
+}
+
+void NETensorHandle::manage(IMemoryGroup *mg)
+{
+    if(mg != nullptr)
+    {
+        auto *ne_mg = arm_compute::utils::cast::polymorphic_downcast<MemoryGroup *>(mg);
+        ne_mg->manage(&_tensor);
+    }
+}
+
+void NETensorHandle::map(bool blocking)
+{
+    ARM_COMPUTE_UNUSED(blocking);
+}
+
+void NETensorHandle::unmap()
+{
+}
+
+void NETensorHandle::release_if_unused()
+{
+    if(!_tensor.is_used())
+    {
+        _tensor.allocator()->free();
+    }
+}
+
+const arm_compute::ITensor &NETensorHandle::tensor() const
+{
+    return _tensor;
+}
+
+arm_compute::ITensor &NETensorHandle::tensor()
+{
+    return _tensor;
+}
+
+ITensorHandle *NETensorHandle::parent_handle()
+{
+    return this;
+}
+
+bool NETensorHandle::is_subtensor() const
+{
+    return false;
+}
+
+Target NETensorHandle::target() const
+{
+    return Target::NEON;
+}
+} // namespace backends
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
new file mode 100644
index 0000000..6b2f68c
--- /dev/null
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/GraphManager.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+#include <algorithm>
+#include <map>
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+namespace
+{
+using HandleCountPair     = std::pair<ITensorHandle *, unsigned int>;
+using HandleCounter       = std::map<HandleCountPair::first_type, HandleCountPair::second_type>;
+using TargetHandleCounter = std::map<Target, HandleCounter>;
+
+/** Holds managed IO tensor handles of a task */
+struct TaskHandles
+{
+    std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> input_handles  = {}; /**< Input handles to a task */
+    std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> output_handles = {}; /**< Output handles of a task */
+};
+
+/** Returns memory group depending on handle backend type
+ *
+ * @param[in] ctx    Graph context
+ * @param[in] handle Tensor handle
+ *
+ * @return Memory group of the handle's target backend
+ */
+IMemoryGroup *get_memory_group_from_handle(GraphContext &ctx, ITensorHandle *handle)
+{
+    ARM_COMPUTE_ERROR_ON(handle == nullptr);
+    return ctx.memory_management_ctx(handle->target())->cross_group.get(); // NOTE(review): assumes a memory management context exists for this target -- verify callers
+}
+
+/** Get handles of const tensors of graph
+ *
+ * @param[in] g Graph
+ *
+ * @return Handles of const (non-transition) tensors of graph
+ */
+std::set<ITensorHandle *> get_const_handles(const Graph &g)
+{
+    std::set<NodeType> const_node_types = { NodeType::Input, NodeType::Output, NodeType::Const };
+
+    std::set<ITensorHandle *> const_tensors;
+
+    auto &nodes = g.nodes();
+    for(auto &node : nodes)
+    {
+        // If it's a boundary node (Input/Output/Const), its tensors must not be transition-managed:
+        if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
+        {
+            // Add all its inputs / outputs to the list of constant handles
+            for(unsigned int i = 0; i < node->num_inputs(); ++i)
+            {
+                if(node->input(i) != nullptr)
+                {
+                    const_tensors.insert(node->input(i)->handle()->parent_handle());
+                }
+            }
+            for(unsigned int i = 0; i < node->num_outputs(); ++i)
+            {
+                if(node->output(i) != nullptr)
+                {
+                    const_tensors.insert(node->output(i)->handle()->parent_handle());
+                }
+            }
+        }
+    }
+
+    return const_tensors;
+}
+
+/** Builds a list of all the transition handles (Handles that are used to link two nodes)
+ *
+ * @param[in] ctx           Graph context
+ * @param[in] task          Workload task
+ * @param[in] const_tensors Constant tensors
+ *
+ * @return List of transition handles
+ */
+TaskHandles get_transition_handles(GraphContext                    &ctx,
+                                   ExecutionTask                   &task,
+                                   const std::set<ITensorHandle *> &const_tensors)
+{
+    ARM_COMPUTE_ERROR_ON(task.node == nullptr || task.task == nullptr);
+    INode &node = *task.node;
+
+    TaskHandles transition_handles;
+
+    // Add input handles
+    for(unsigned int i = 0; i < node.input_edges().size(); ++i)
+    {
+        Edge *input_edge = node.input_edge(i);
+        // If this input is the output of another node
+        if(input_edge != nullptr && input_edge->tensor() != nullptr && const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors))
+        {
+            // Then add it to the list of transition buffers
+            ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle();
+            IMemoryGroup *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
+            transition_handles.input_handles.push_back(std::make_pair(tensor_handle, mm_group));
+        }
+    }
+
+    // Add output handles
+    for(unsigned int i = 0; i < node.num_outputs(); ++i)
+    {
+        Tensor *output_tensor = node.output(i);
+        // If this output is used as an input for another node
+        if(output_tensor != nullptr && const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors))
+        {
+            ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle();
+            IMemoryGroup *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
+            transition_handles.output_handles.push_back(std::make_pair(tensor_handle, mm_group));
+        }
+    }
+
+    return transition_handles;
+}
+
+/** Counts handles refcount for each input handle of each target
+ *
+ * @param[in]     task_handles   Task handles containing the managed input handles
+ * @param[in,out] handle_counter Data structure that keeps the handles reference count
+ */
+void count_input_handles_per_target(const TaskHandles &task_handles, TargetHandleCounter &handle_counter)
+{
+    for(const auto &handle : task_handles.input_handles)
+    {
+        ITensorHandle *key            = handle.first;
+        HandleCounter &target_counter = handle_counter[key->target()];
+        if(target_counter.find(key) == std::end(target_counter))
+        {
+            target_counter.emplace(std::make_pair(key, 1));
+        }
+        else
+        {
+            ++target_counter[key];
+        }
+    }
+}
+
+/** Calculates the lifetime of each tensor handle
+ *
+ * @param[in, out] tasks_handles Tensor handles for each task
+ * @param[in]      hc            Data structure that keeps the handles reference count
+ */
+void configure_handle_lifetime(std::vector<TaskHandles> &tasks_handles, const HandleCounter &hc)
+{
+    // Maps each handle currently in flight to its remaining number of consumers
+    HandleCounter tensors_in_flight;
+
+    // Acquires the given handles and sets them as in flight if they aren't already
+    auto acquire = [&](std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> &handles)
+    {
+        for(auto &handle : handles)
+        {
+            ITensorHandle *parent_handle = handle.first;
+            ARM_COMPUTE_ERROR_ON(parent_handle == nullptr);
+            // If the tensor is not already in flight:
+            if(tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight))
+            {
+                ARM_COMPUTE_ERROR_ON(hc.find(parent_handle) == std::end(hc));
+                // Then add it to the list of in flight tensors
+                tensors_in_flight.insert(std::make_pair(parent_handle, hc.at(parent_handle)));
+                // Start of allocation's lifetime
+                parent_handle->manage(handle.second);
+            }
+        }
+    };
+
+    for(auto &task_handle : tasks_handles)
+    {
+        // Marking all the input and output tensors of the task as in flight
+        acquire(task_handle.input_handles);
+        acquire(task_handle.output_handles);
+
+        // Releasing the input tensors
+        for(auto &input_handle : task_handle.input_handles)
+        {
+            ITensorHandle *ihandle = input_handle.first;
+            ARM_COMPUTE_ERROR_ON(ihandle == nullptr);
+            ARM_COMPUTE_ERROR_ON(tensors_in_flight.find(ihandle) == std::end(tensors_in_flight));
+            --tensors_in_flight[ihandle];
+            if(tensors_in_flight[ihandle] <= 0) // counter is unsigned, so this is effectively == 0
+            {
+                // Remove tensor from tensors in flight
+                tensors_in_flight.erase(ihandle);
+                // End of allocation's lifetime
+                ihandle->allocate();
+            }
+        }
+    }
+}
+} // namespace
+
+void configure_transition_manager(Graph &g, GraphContext &ctx, ExecutionWorkload &workload)
+{
+    // Get const tensors (un-managed)
+    std::set<ITensorHandle *> const_tensors = get_const_handles(g);
+
+    std::vector<TaskHandles> tasks_handles;
+    TargetHandleCounter      target_handle_count;
+
+    // Count handles
+    for(auto &task : workload.tasks)
+    {
+        // Populates IO handles
+        tasks_handles.push_back(get_transition_handles(ctx, task, const_tensors));
+
+        // Count handles
+        count_input_handles_per_target(tasks_handles.back(), target_handle_count);
+    }
+
+    // Setup memory managers
+    for(auto &hc : target_handle_count)
+    {
+        MemoryManagerContext *mm_ctx = ctx.memory_management_ctx(hc.first);
+        if(mm_ctx != nullptr)
+        {
+            if(mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr)
+            {
+                // Manage and allocate tensors
+                configure_handle_lifetime(tasks_handles, hc.second);
+            }
+        }
+    }
+}
+} // namespace detail
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
new file mode 100644
index 0000000..c370fdf
--- /dev/null
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/detail/ExecutionHelpers.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/GraphManager.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+void default_initialize_backends()
+{
+    for(const auto &backend : backends::BackendRegistry::get().backends())
+    {
+        backend.second->initialize_backend();
+    }
+}
+
+void validate_all_nodes(Graph &g)
+{
+    auto &nodes = g.nodes();
+
+    // Create tasks
+    for(auto &node : nodes)
+    {
+        if(node != nullptr)
+        {
+            Target assigned_target = node->assigned_target();
+            auto   backend         = backends::BackendRegistry::get().find_backend(assigned_target);
+            ARM_COMPUTE_ERROR_ON_MSG(!backend, "Requested backend doesn't exist!");
+            Status status = backend->validate_node(*node);
+            ARM_COMPUTE_ERROR_ON_MSG(!bool(status), status.error_description().c_str());
+        }
+    }
+}
+
+void configure_all_tensors(Graph &g)
+{
+    auto &tensors = g.tensors();
+
+    for(auto &tensor : tensors)
+    {
+        if(tensor)
+        {
+            Target target  = tensor->desc().target;
+            auto   backend = backends::BackendRegistry::get().find_backend(target);
+            ARM_COMPUTE_ERROR_ON_MSG(!backend, "Requested backend doesn't exist!");
+            auto handle = backend->create_tensor(*tensor);
+            ARM_COMPUTE_ERROR_ON_MSG(!handle, "Couldn't create backend handle!"); // was erroneously re-checking 'backend' (copy-paste bug)
+            tensor->set_handle(std::move(handle));
+        }
+    }
+}
+
+void allocate_all_input_tensors(INode &node)
+{
+    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    {
+        Tensor *tensor = node.input(i);
+        if(tensor != nullptr && !tensor->bound_edges().empty())
+        {
+            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+void allocate_all_output_tensors(INode &node)
+{
+    for(unsigned int i = 0; i < node.num_outputs(); ++i)
+    {
+        Tensor *tensor = node.output(i);
+        if(tensor != nullptr && !tensor->bound_edges().empty())
+        {
+            ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+void allocate_const_tensors(Graph &g)
+{
+    for(auto &node : g.nodes())
+    {
+        if(node != nullptr)
+        {
+            switch(node->type())
+            {
+                case NodeType::Const:
+                case NodeType::Input:
+                    allocate_all_output_tensors(*node);
+                    break;
+                case NodeType::Output:
+                    allocate_all_input_tensors(*node);
+                    break; // was an implicit fall-through into default (benign, but fragile)
+                default: break;
+            }
+        }
+    }
+}
+
+void allocate_all_tensors(Graph &g)
+{
+    auto &tensors = g.tensors();
+
+    for(auto &tensor : tensors)
+    {
+        if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
+        {
+            tensor->handle()->allocate();
+        }
+    }
+}
+
+ExecutionWorkload configure_all_nodes(Graph &g, GraphContext &ctx)
+{
+    ExecutionWorkload workload;
+    workload.graph = &g;
+    workload.ctx   = &ctx;
+
+    auto &nodes = g.nodes();
+
+    // Create tasks
+    for(auto &node : nodes)
+    {
+        if(node != nullptr)
+        {
+            Target assigned_target = node->assigned_target();
+            auto   backend         = backends::BackendRegistry::get().find_backend(assigned_target);
+            ARM_COMPUTE_ERROR_ON_MSG(!backend, "Requested backend doesn't exist!");
+            auto func = backend->configure_node(*node, ctx);
+            if(func != nullptr)
+            {
+                ExecutionTask task;
+                task.task = std::move(func);
+                task.node = node.get();
+                workload.tasks.push_back(std::move(task));
+            }
+        }
+    }
+
+    // Add inputs and outputs
+    for(auto &node : nodes)
+    {
+        if(node != nullptr && node->type() == NodeType::Input)
+        {
+            workload.inputs.push_back(node->output(0));
+        }
+
+        if(node != nullptr && node->type() == NodeType::Output)
+        {
+            workload.outputs.push_back(node->input(0));
+            // (removed a dead 'continue;' -- it was the last statement of the loop body)
+        }
+    }
+
+    return workload;
+}
+
+void release_unused_tensors(Graph &g)
+{
+    for(auto &tensor : g.tensors())
+    {
+        if(tensor != nullptr && tensor->handle() != nullptr)
+        {
+            tensor->handle()->release_if_unused();
+        }
+    }
+}
+
+void call_tensor_accessor(Tensor *tensor)
+{
+    ARM_COMPUTE_ERROR_ON(!tensor);
+    tensor->call_accessor();
+}
+
+void call_all_const_node_accessors(Graph &g)
+{
+    auto &nodes = g.nodes();
+
+    for(auto &node : nodes)
+    {
+        if(node != nullptr && node->type() == NodeType::Const)
+        {
+            call_tensor_accessor(node->output(0));
+        }
+    }
+}
+
+void call_all_input_node_accessors(ExecutionWorkload &workload)
+{
+    for(auto &input : workload.inputs)
+    {
+        if(input != nullptr)
+        {
+            input->call_accessor();
+        }
+    }
+}
+
+void prepare_all_tasks(ExecutionWorkload &workload)
+{
+    ARM_COMPUTE_ERROR_ON(workload.graph == nullptr);
+    for(auto &task : workload.tasks)
+    {
+        task.prepare();
+        release_unused_tensors(*workload.graph);
+    }
+}
+
+void call_all_tasks(ExecutionWorkload &workload)
+{
+    ARM_COMPUTE_ERROR_ON(workload.ctx == nullptr);
+
+    // Acquire memory for the transition buffers
+    for(auto &mm_ctx : workload.ctx->memory_managers())
+    {
+        if(mm_ctx.second.cross_group != nullptr)
+        {
+            mm_ctx.second.cross_group->acquire();
+        }
+    }
+
+    // Execute tasks
+    for(auto &task : workload.tasks)
+    {
+        task();
+    }
+
+    // Release memory for the transition buffers
+    for(auto &mm_ctx : workload.ctx->memory_managers())
+    {
+        if(mm_ctx.second.cross_group != nullptr)
+        {
+            mm_ctx.second.cross_group->release();
+        }
+    }
+}
+
+void call_all_output_node_accessors(ExecutionWorkload &workload)
+{
+    for(auto &output : workload.outputs)
+    {
+        if(output != nullptr)
+        {
+            output->call_accessor();
+        }
+    }
+}
+} // namespace detail
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp
new file mode 100644
index 0000000..96a166c
--- /dev/null
+++ b/src/graph/frontend/Stream.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/frontend/Stream.h"
+
+#include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/frontend/ILayer.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace frontend
+{
+Stream::Stream(size_t id, std::string name)
+    : _manager(), _ctx(), _g(id, std::move(name))
+{
+}
+
+void Stream::finalize(Target target, const GraphConfig &config)
+{
+    PassManager pm = create_default_pass_manager(target);
+    _ctx.set_config(config);
+    _manager.finalize_graph(_g, _ctx, pm, target);
+}
+
+void Stream::run()
+{
+    _manager.execute_graph(_g);
+}
+
+void Stream::add_layer(ILayer &layer)
+{
+    auto nid   = layer.create_layer(*this);
+    _tail_node = nid;
+}
+
+const Graph &Stream::graph() const
+{
+    return _g;
+}
+
+Graph &Stream::graph()
+{
+    return _g;
+}
+} // namespace frontend
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/frontend/SubStream.cpp
similarity index 63%
copy from src/graph/CL/CLMap.cpp
copy to src/graph/frontend/SubStream.cpp
index 5289ea9..e8bd23a 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/frontend/SubStream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,39 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/frontend/SubStream.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/frontend/ILayer.h"
 
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_compute
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+namespace graph
+{
+namespace frontend
+{
+SubStream::SubStream(IStream &s)
+    : _s(s)
+{
+    _hints     = s.hints();
+    _tail_node = s.tail_node();
 }
 
-void CLMap::run()
+void SubStream::add_layer(ILayer &layer)
 {
-    _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+    auto nid   = layer.create_layer(*this);
+    _tail_node = nid;
 }
+
+const Graph &SubStream::graph() const
+{
+    return _s.graph();
+}
+
+Graph &SubStream::graph()
+{
+    return _s.graph();
+}
+} // namespace frontend
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
new file mode 100644
index 0000000..c56f4c5
--- /dev/null
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/core/utils/misc/Iterable.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+const char *DepthConcatSubTensorMutator::name()
+{
+    return "DepthConcatSubTensorMutator";
+}
+
+void DepthConcatSubTensorMutator::mutate(Graph &g)
+{
+    // Should be in reverse order of execution
+    for(auto &node : arm_compute::utils::iterable::reverse_iterate(g.nodes()))
+    {
+        if(node && node->type() == NodeType::DepthConcatenateLayer && node->output(0) != nullptr)
+        {
+            // Get output tensor
+            auto output_tensor = node->output(0);
+
+            // Check that all tensor have the same target and valid inputs
+            bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
+                                        [&](const EdgeID & eid)
+            {
+                return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target);
+            });
+
+            // Create subtensors
+            if(is_valid && backends::BackendRegistry::get().find_backend(output_tensor->desc().target) != nullptr)
+            {
+                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
+                                              << node->id() << " and name : " << node->name() << std::endl);
+                // Create sub-tensor handles
+                unsigned depth = 0;
+                for(unsigned int i = 0; i < node->input_edges().size(); ++i)
+                {
+                    auto       input_tensor = node->input(i);
+                    const auto input_shape  = input_tensor->desc().shape;
+
+                    auto backend = backends::BackendRegistry::get().find_backend(input_tensor->desc().target);
+                    auto handle  = backend->create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false);
+                    input_tensor->set_handle(std::move(handle));
+
+                    depth += input_shape.z();
+                }
+
+                auto *dc_node = arm_compute::utils::cast::polymorphic_downcast<DepthConcatenateLayerNode *>(node.get());
+                dc_node->set_enabled(false);
+            }
+        }
+    }
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
new file mode 100644
index 0000000..bd3f098
--- /dev/null
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/InPlaceOperationMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+const char *InPlaceOperationMutator::name()
+{
+    return "InPlaceOperationMutator";
+}
+
+void InPlaceOperationMutator::mutate(Graph &g)
+{
+    std::set<NodeType> in_place_nodes = { NodeType::BatchNormalizationLayer, NodeType::ActivationLayer };
+
+    // Not interested in the order of nodes
+    for(auto &node : g.nodes())
+    {
+        if(node && in_place_nodes.find(node->type()) != std::end(in_place_nodes))
+        {
+            // Get input edge
+            Edge *input_edge = node->input_edge(0);
+
+            // Check if parent has a single output if yes then force in place calculation else not
+            if((input_edge != nullptr) && (input_edge->producer() != nullptr) && (input_edge->producer()->output_edges().size() == 1))
+            {
+                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Switching to in-place computation for the node with ID : "
+                                              << node->id() << " and name : " << node->name() << std::endl);
+                // Update output
+                auto tensor = input_edge->tensor();
+                node->set_output_tensor(tensor->id(), 0);
+            }
+        }
+    }
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
new file mode 100644
index 0000000..2e893c2
--- /dev/null
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/NodeFusionMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+namespace detail
+{
+void fuse_batch_norm_with_activation(Graph &g)
+{
+    // Not interested in the order of nodes
+    for(auto &node : g.nodes())
+    {
+        // Check if the node is batch norm and not a branching node
+        if(node && node->type() == NodeType::BatchNormalizationLayer && node->output_edges().size() == 1)
+        {
+            auto output_edge_id = *node->output_edges().begin();
+            auto output_edge    = g.edge(output_edge_id);
+            // Check if following node is an activation layer node
+            if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
+            {
+                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing Batch Normalization node with ID : " << output_edge->producer_id()
+                                              << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
+
+                auto *bn_node  = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->producer());
+                auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
+
+                // Get driving nodes of activation node
+                std::vector<NodeIdxPair> act_driving_nodes;
+                for(auto &act_output_edge_id : act_node->output_edges())
+                {
+                    auto act_output_edge = g.edge(act_output_edge_id);
+                    if(act_output_edge != nullptr)
+                    {
+                        ARM_COMPUTE_ERROR_ON(act_output_edge->consumer() == nullptr);
+                        act_driving_nodes.push_back({ act_output_edge->consumer_id(), act_output_edge->consumer_idx() });
+                    }
+                }
+
+                // Set activation info to batch normalization
+                bn_node->set_fused_activation(act_node->activation_info());
+
+                // Remove activation node
+                g.remove_node(act_node->id());
+
+                // Update batch normalization node outputs
+                for(auto &driving_node : act_driving_nodes)
+                {
+                    g.add_connection(bn_node->id(), 0, driving_node.node_id, driving_node.index);
+                }
+            }
+        }
+    }
+}
+} // namespace detail
+
+const char *NodeFusionMutator::name()
+{
+    return "NodeFusionMutator";
+}
+
+void NodeFusionMutator::mutate(Graph &g)
+{
+    detail::fuse_batch_norm_with_activation(g);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
new file mode 100644
index 0000000..2a8c029
--- /dev/null
+++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/nodes/SplitLayerNode.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/core/utils/misc/Iterable.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+const char *SplitLayerSubTensorMutator::name()
+{
+    return "SplitLayerSubTensorMutator";
+}
+
+void SplitLayerSubTensorMutator::mutate(Graph &g)
+{
+    // Should be in reverse order of execution
+    for(auto &node : arm_compute::utils::iterable::reverse_iterate(g.nodes()))
+    {
+        if(node && node->type() == NodeType::SplitLayer && node->input(0) != nullptr)
+        {
+            // Get output tensor
+            Tensor *input_tensor = node->input(0);
+
+            // Check that all tensor have the same target and are valid
+            bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(),
+                                        [&](const TensorID & tid)
+            {
+                return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target);
+            });
+
+            // Create subtensors
+            if(is_valid && backends::BackendRegistry::get().find_backend(input_tensor->desc().target) != nullptr)
+            {
+                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
+                                              << node->id() << " and name : " << node->name() << std::endl);
+
+                auto *split_node = arm_compute::utils::cast::polymorphic_downcast<SplitLayerNode *>(node.get());
+
+                const unsigned int axis          = split_node->axis();
+                const unsigned int num_splits    = split_node->num_splits();
+                const bool         extend_parent = (axis < 2);
+
+                // Create sub-tensor handles
+                for(unsigned int i = 0; i < node->outputs().size(); ++i)
+                {
+                    Tensor           *output_tensor = node->output(i);
+                    const TensorShape output_shape  = output_tensor->desc().shape;
+                    Coordinates       coords;
+                    std::tie(std::ignore, coords) = SplitLayerNode::compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
+
+                    backends::IDeviceBackend      *backend = backends::BackendRegistry::get().find_backend(output_tensor->desc().target);
+                    std::unique_ptr<ITensorHandle> handle  = backend->create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
+                    output_tensor->set_handle(std::move(handle));
+                }
+            }
+        }
+    }
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
deleted file mode 100644
index 546c42a..0000000
--- a/src/graph/nodes/ActivationLayer.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/ActivationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
-    : _activation_info(activation_info)
-{
-    set_supports_in_place(true);
-}
-
-std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::ActivationLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<ActivationLayerInfo>("ActivationLayerInfo", _activation_info);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::ActivationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
new file mode 100644
index 0000000..414684c
--- /dev/null
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ActivationLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ActivationLayerNode::ActivationLayerNode(ActivationLayerInfo info)
+    : _info(info)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+ActivationLayerInfo ActivationLayerNode::activation_info() const
+{
+    return _info;
+}
+
+bool ActivationLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor ActivationLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return src->desc();
+}
+
+NodeType ActivationLayerNode::type() const
+{
+    return NodeType::ActivationLayer;
+}
+
+void ActivationLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/BatchNormalizationLayer.cpp b/src/graph/nodes/BatchNormalizationLayer.cpp
deleted file mode 100644
index 24287ac..0000000
--- a/src/graph/nodes/BatchNormalizationLayer.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/BatchNormalizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> BatchNormalizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    unsigned int batch_norm_size = in->info()->dimension(2);
-    if(_mean.tensor() == nullptr)
-    {
-        _mean.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
-    }
-    if(_var.tensor() == nullptr)
-    {
-        _var.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
-    }
-    if(_beta.tensor() == nullptr)
-    {
-        _beta.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
-    }
-    if(_gamma.tensor() == nullptr)
-    {
-        _gamma.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
-    }
-
-    bool mean_is_loaded  = _mean.tensor() != nullptr;
-    bool var_is_loaded   = _var.tensor() != nullptr;
-    bool gamma_is_loaded = _gamma.tensor() != nullptr;
-    bool beta_is_loaded  = _beta.tensor() != nullptr;
-
-    // Set mean, var, gamma and beta target
-    _mean.set_target(_target_hint);
-    _var.set_target(_target_hint);
-    _gamma.set_target(_target_hint);
-    _beta.set_target(_target_hint);
-
-    // Create node context
-    NodeContext node_ctx(OperationType::BatchNormalizationLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_input(_mean.tensor());
-    node_ctx.add_input(_var.tensor());
-    node_ctx.add_input(_beta.tensor());
-    node_ctx.add_input(_gamma.tensor());
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<float>("epsilon", _epsilon);
-    node_ctx.add_parameter<ActivationLayerInfo>("act_info", _act_info);
-
-    // Configure operation
-    auto func = OperationRegistry::get().find_operation(OperationType::BatchNormalizationLayer, _target_hint)->configure(node_ctx);
-
-    // Fill tensors
-    if(!mean_is_loaded)
-    {
-        _mean.allocate_and_fill_if_needed();
-    }
-    if(!var_is_loaded)
-    {
-        _var.allocate_and_fill_if_needed();
-    }
-    if(!gamma_is_loaded)
-    {
-        _gamma.allocate_and_fill_if_needed();
-    }
-    if(!beta_is_loaded)
-    {
-        _beta.allocate_and_fill_if_needed();
-    }
-
-    // Get function
-    return func;
-}
\ No newline at end of file
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
new file mode 100644
index 0000000..3ae11fc
--- /dev/null
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BatchNormalizationLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+BatchNormalizationLayerNode::BatchNormalizationLayerNode(float epsilon, ActivationLayerInfo fused_activation)
+    : _epsilon(epsilon), _fused_activation(fused_activation)
+{
+    _input_edges.resize(5, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+float BatchNormalizationLayerNode::epsilon() const
+{
+    return _epsilon;
+}
+
+ActivationLayerInfo BatchNormalizationLayerNode::fused_activation() const
+{
+    return _fused_activation;
+}
+
+void BatchNormalizationLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+    _fused_activation = fused_activation;
+}
+
+bool BatchNormalizationLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor BatchNormalizationLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return src->desc();
+}
+
+NodeType BatchNormalizationLayerNode::type() const
+{
+    return NodeType::BatchNormalizationLayer;
+}
+
+void BatchNormalizationLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/BranchLayer.cpp b/src/graph/nodes/BranchLayer.cpp
deleted file mode 100644
index 7a20a56..0000000
--- a/src/graph/nodes/BranchLayer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/BranchLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/SubGraph.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
-
-#include <memory>
-#include <tuple>
-#include <vector>
-
-using namespace arm_compute::graph;
-
-/** Branch function */
-class BranchFunction final : public arm_compute::IFunction
-{
-public:
-    /** Default Constructor */
-    BranchFunction()
-        : _graphs()
-    {
-    }
-    /** Registers graph to be executed by the branch function
-     *
-     * @param[in] graph Graph to register
-     */
-    void register_graph(std::unique_ptr<Graph> graph)
-    {
-        _graphs.push_back(std::move(graph));
-    }
-    // Inherited methods overriden:
-    void run() override
-    {
-        for(auto &g : _graphs)
-        {
-            ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
-            g->run();
-        }
-    }
-
-private:
-    std::vector<std::unique_ptr<Graph>> _graphs;
-};
-
-std::unique_ptr<arm_compute::IFunction> BranchLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON(_branch_merge_method != BranchMergeMethod::DEPTH_CONCATENATE);
-    ARM_COMPUTE_UNUSED(_branch_merge_method);
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    // Create branch function
-    auto func = arm_compute::support::cpp14::make_unique<BranchFunction>();
-
-    // Track output depth
-    int depth = 0;
-
-    // Constuct all sub-graphs given the input/output
-    for(auto &sg : _sub_graphs)
-    {
-        ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
-
-        // IO buffers
-        std::unique_ptr<ITensorObject> in;
-        std::unique_ptr<ITensorObject> out;
-        SubTensor                     *out_sub_tensor = nullptr;
-
-        // Create input sub-tensor
-        if(!sg->has_input())
-        {
-            ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
-            in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
-                                                                     input->tensor()->info()->tensor_shape(),
-                                                                     Coordinates());
-        }
-
-        // Create output sub-tensor
-        if(!sg->has_output())
-        {
-            ARM_COMPUTE_ERROR_ON((dynamic_cast<Tensor *>(output) == nullptr) && (dynamic_cast<SubTensor *>(output) == nullptr));
-
-            out = arm_compute::support::cpp14::make_unique<SubTensor>(output->tensor(),
-                                                                      TensorShape(),
-                                                                      Coordinates(0, 0, depth),
-                                                                      output->target(),
-                                                                      true);
-            out_sub_tensor = dynamic_cast<SubTensor *>(out.get());
-        }
-
-        // Construct sub_graph
-        auto g = sg->construct(ctx, std::move(in), std::move(out));
-
-        // Register graph to function
-        func->register_graph(std::move(g));
-
-        // Update and track depth
-        if(out_sub_tensor != nullptr)
-        {
-            ARM_COMPUTE_ERROR_ON(out_sub_tensor->tensor() == nullptr);
-            depth += out_sub_tensor->tensor()->info()->tensor_shape()[2];
-        }
-    }
-
-    return std::move(func);
-}
\ No newline at end of file
diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp
new file mode 100644
index 0000000..2f3cd14
--- /dev/null
+++ b/src/graph/nodes/ConstNode.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ConstNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ConstNode::ConstNode(TensorDescriptor desc)
+    : _desc(std::move(desc))
+{
+    _outputs.resize(1, NullTensorID);
+}
+
+bool ConstNode::forward_descriptors()
+{
+    if(output_id(0) != NullTensorID)
+    {
+        Tensor *t = output(0);
+        ARM_COMPUTE_ERROR_ON(t == nullptr);
+        t->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor ConstNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    return _desc;
+}
+
+NodeType ConstNode::type() const
+{
+    return NodeType::Const;
+}
+
+void ConstNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
deleted file mode 100644
index f292b89..0000000
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/ConvolutionLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "support/ToolchainSupport.h"
-#include "utils/GraphTypePrinter.h"
-#include "utils/TypePrinter.h"
-
-#include <tuple>
-#include <vector>
-
-using namespace arm_compute::graph;
-
-namespace
-{
-/** Calculates the output shaped of the convolution layer
- *
- * @param[in] input_shape   Input tensor shape
- * @param[in] weights_shape Weights shape
- * @param[in] conv_info     Convolution information (padding, stride, etc.)
- *
- * @return The expected output tensor shape
- */
-TensorShape calculate_convolution_layer_output_shape(const TensorShape &input_shape, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
-{
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-
-    // Get output width and height
-    std::tie(output_width, output_height) = arm_compute::scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
-    // Create output shape
-    TensorShape output_shape = input_shape;
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-    output_shape.set(2, weights_shape[3]);
-
-    return output_shape;
-}
-
-// Instantiate GEMM based convolution layer
-template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
-                                                             const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
-{
-    auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
-    conv->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(weights),
-        dynamic_cast<TensorType *>(biases),
-        dynamic_cast<TensorType *>(output),
-        conv_info, weights_info);
-    return std::move(conv);
-}
-
-// Instantiate direct convolution layer
-template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_direct_function(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
-                                                                    const PadStrideInfo &conv_info)
-{
-    auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
-    conv->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(weights),
-        dynamic_cast<TensorType *>(biases),
-        dynamic_cast<TensorType *>(output),
-        conv_info);
-    return std::move(conv);
-}
-
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
-                                                    const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                                    ConvolutionMethodHint conv_method);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
-                                                                        const PadStrideInfo &conv_info,
-                                                                        const WeightsInfo    &weights_info,
-                                                                        ConvolutionMethodHint conv_method)
-{
-    if((conv_method == ConvolutionMethodHint::DIRECT)
-       && arm_compute::CLDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDirectConvolutionLayer");
-        return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
-        return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
-    }
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
-                                                                      const PadStrideInfo &conv_info,
-                                                                      const WeightsInfo    &weights_info,
-                                                                      ConvolutionMethodHint conv_method)
-{
-    if((conv_method == ConvolutionMethodHint::DIRECT)
-       && arm_compute::NEDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDirectConvolutionLayer");
-        return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
-        return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
-    }
-}
-} // namespace
-
-/** Grouped Convolution function */
-class GroupedConvolutionFunction final : public arm_compute::IFunction
-{
-public:
-    /** Default Constructor */
-    GroupedConvolutionFunction()
-        : _convolutions()
-    {
-    }
-    /** Default Destructor */
-    ~GroupedConvolutionFunction() final = default;
-    /** Prevent instances from being copy constructed */
-    GroupedConvolutionFunction(const GroupedConvolutionFunction &) = delete;
-    /** Prevent instances from being copy assigned */
-    GroupedConvolutionFunction &operator=(const GroupedConvolutionFunction &) = delete;
-    /** Allow instances to be move constructed */
-    GroupedConvolutionFunction(GroupedConvolutionFunction &&) noexcept = default;
-    /** Allow instances to be move assigned */
-    GroupedConvolutionFunction &operator=(GroupedConvolutionFunction &&) noexcept = default;
-    /** Adds a convolution
-     *
-     * @param convolution Convolution function to add
-     */
-    void add_convolution_function(std::unique_ptr<IFunction> convolution)
-    {
-        _convolutions.emplace_back(std::move(convolution));
-    }
-
-    // Inherited methods overriden:
-    void run() override
-    {
-        for(auto &c : _convolutions)
-        {
-            c->run();
-        }
-    }
-
-private:
-    std::vector<std::unique_ptr<IFunction>> _convolutions;
-};
-
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-
-    // Set weights and biases info
-    if(_weights.tensor() == nullptr)
-    {
-        TensorInfo info = TensorInfo(TensorShape(_conv_width, _conv_height, in->info()->dimension(2) / _num_groups, _ofm),
-                                     in->info()->num_channels(),
-                                     in->info()->data_type(),
-                                     in->info()->fixed_point_position());
-        info.set_quantization_info(_weights_quant_info);
-        _weights.set_info(std::move(info));
-    }
-    if(_biases.has_accessor() && _biases.tensor() == nullptr)
-    {
-        DataType dt = in->info()->data_type();
-        _biases.set_info(TensorInfo(TensorShape(_ofm), in->info()->num_channels(), is_data_type_quantized_asymmetric(dt) ? DataType::S32 : dt, in->info()->fixed_point_position()));
-    }
-
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint                                 = ctx.hints().target_hint();
-    const ConvolutionMethodHint conv_method_hint = ctx.hints().convolution_method_hint();
-
-    // Check if the weights and biases are loaded
-    bool weights_are_loaded = _weights.tensor() != nullptr;
-    bool biases_are_loaded  = _biases.has_accessor() ? _biases.tensor() != nullptr : true;
-
-    // Set bias and weights target
-    _weights.set_target(_target_hint);
-    if(_biases.has_accessor())
-    {
-        _biases.set_target(_target_hint);
-    }
-
-    // Calculate output shape
-    TensorShape output_shape = calculate_convolution_layer_output_shape(in->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
-
-    // Output auto inizialitation if not yet initialized
-    arm_compute::auto_init_if_empty(*out->info(), output_shape, 1, in->info()->data_type(), in->info()->fixed_point_position(),
-                                    (_out_quant_info.empty()) ? in->info()->quantization_info() : _out_quant_info);
-
-    // Create appropriate convolution function
-    if(_num_groups == 1)
-    {
-        func = instantiate_convolution(in, out, conv_method_hint);
-    }
-    else
-    {
-        func = instantiate_grouped_convolution(in, out, conv_method_hint);
-    }
-
-    // Fill weights
-    if(!weights_are_loaded)
-    {
-        _weights.allocate_and_fill_if_needed();
-    }
-    // Fill biases
-    if(!biases_are_loaded)
-    {
-        _biases.allocate_and_fill_if_needed();
-    }
-
-    ARM_COMPUTE_LOG_GRAPH_INFO(" Data Type: " << in->info()->data_type()
-                               << " Input Shape: " << in->info()->tensor_shape()
-                               << " Weights shape: " << _weights.info().tensor_shape()
-                               << " Biases Shape: " << _biases.info().tensor_shape()
-                               << " Output Shape: " << out->info()->tensor_shape()
-                               << " PadStrideInfo: " << _conv_info
-                               << " Groups: " << _num_groups
-                               << " WeightsInfo: " << _weights_info
-                               << std::endl);
-
-    return func;
-}
-
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
-{
-    std::unique_ptr<arm_compute::IFunction> func;
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
-    }
-    return func;
-}
-
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_grouped_convolution(ITensor *input, ITensor *output, ConvolutionMethodHint conv_method_hint)
-{
-    // Get tensor shapes
-    TensorShape input_shape   = input->info()->tensor_shape();
-    TensorShape output_shape  = output->info()->tensor_shape();
-    TensorShape weights_shape = _weights.info().tensor_shape();
-    TensorShape biases_shape  = _biases.info().tensor_shape();
-
-    ARM_COMPUTE_ERROR_ON_MSG((input_shape.z() % _num_groups) != 0, "Input depth not multiple of the number of groups!");
-    ARM_COMPUTE_ERROR_ON_MSG((output_shape.z() % _num_groups) != 0, "Output depth not multiple of the number of groups!");
-    ARM_COMPUTE_ERROR_ON_MSG((weights_shape[3] % _num_groups) != 0, "Number of kernels not multiple of the number of groups!");
-    ARM_COMPUTE_ERROR_ON_MSG((biases_shape.x() % _num_groups) != 0, "Biases not multiple of the number of groups!");
-
-    // Create a grouped convolution function
-    auto grouped_conv = arm_compute::support::cpp14::make_unique<GroupedConvolutionFunction>();
-
-    // Create sub-tensors vectors
-    _is = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
-    _os = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
-    _ws = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
-    _bs = arm_compute::support::cpp14::make_unique<SubTensor[]>(_num_groups);
-
-    // Calculate sub-tensor splits
-    const int input_split   = input_shape.z() / _num_groups;
-    const int output_split  = output_shape.z() / _num_groups;
-    const int weights_split = weights_shape[3] / _num_groups;
-    const int biases_split  = biases_shape.x() / _num_groups;
-
-    // Calculate sub-tensor shapes
-    input_shape.set(2, input_split);
-    output_shape.set(2, output_split);
-    weights_shape.set(3, weights_split);
-    biases_shape.set(0, biases_split);
-
-    // Configure sub-tensors
-    for(int i = 0; i < static_cast<int>(_num_groups); ++i)
-    {
-        // Create convolution function
-        std::unique_ptr<arm_compute::IFunction> func;
-
-        // Calculate sub-tensors starting coordinates
-        Coordinates input_coord(0, 0, input_split * i);
-        Coordinates output_coord(0, 0, output_split * i);
-        Coordinates weights_coord(0, 0, 0, weights_split * i);
-        Coordinates biases_coord(biases_split * i);
-
-        // Create sub-tensors for input, output, weights and bias
-        auto hint_to_use = (_target_hint == TargetHint::OPENCL) ? TargetHint::OPENCL : TargetHint::NEON;
-        _is[i]           = SubTensor(input, input_shape, input_coord, hint_to_use);
-        _os[i]           = SubTensor(output, output_shape, output_coord, hint_to_use);
-        _ws[i]           = SubTensor(_weights.tensor(), weights_shape, weights_coord, hint_to_use);
-        _bs[i]           = SubTensor(_biases.tensor(), biases_shape, biases_coord, hint_to_use);
-
-        // Instantiate convolution function
-        if(_target_hint == TargetHint::OPENCL)
-        {
-            func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
-        }
-        else
-        {
-            func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
-        }
-
-        // Add convolution function to the list of convolutions for the grouped convolution
-        grouped_conv->add_convolution_function(std::move(func));
-    }
-
-    return std::move(grouped_conv);
-}
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
new file mode 100644
index 0000000..6c31a6b
--- /dev/null
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ConvolutionLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ConvolutionLayerNode::ConvolutionLayerNode(PadStrideInfo info, ConvolutionMethod method, FastMathHint fast_math_hint, QuantizationInfo out_quant_info)
+    : _info(std::move(info)), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info)
+{
+    _input_edges.resize(3, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+void ConvolutionLayerNode::set_convolution_method(ConvolutionMethod method)
+{
+    _method = method;
+}
+
+ConvolutionMethod ConvolutionLayerNode::convolution_method() const
+{
+    return _method;
+}
+
+void ConvolutionLayerNode::set_fast_math_hint(FastMathHint hint)
+{
+    _fast_math_hint = hint;
+}
+
+FastMathHint ConvolutionLayerNode::fast_math_hint() const
+{
+    return _fast_math_hint;
+}
+
+PadStrideInfo ConvolutionLayerNode::convolution_info() const
+{
+    return _info;
+}
+
+TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                 const TensorDescriptor &weights_descriptor,
+                                                                 const PadStrideInfo    &info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+
+    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+
+    return output_descriptor;
+}
+
+bool ConvolutionLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor ConvolutionLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    const Tensor *src     = input(0);
+    const Tensor *weights = input(1);
+
+    ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
+    if(!_out_quant_info.empty())
+    {
+        output_info.quant_info = _out_quant_info;
+    }
+
+    return output_info;
+}
+
+NodeType ConvolutionLayerNode::type() const
+{
+    return NodeType::ConvolutionLayer;
+}
+
+void ConvolutionLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/DeQuantizationLayer.cpp b/src/graph/nodes/DeQuantizationLayer.cpp
deleted file mode 100644
index af9ecee..0000000
--- a/src/graph/nodes/DeQuantizationLayer.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/DequantizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> DequantizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    _target_hint              = ctx.hints().target_hint();
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-
-    if(_min_max.tensor() == nullptr)
-    {
-        TensorShape shape = in->info()->tensor_shape();
-        shape.set(Window::DimX, 2);
-        shape.remove_dimension(1);
-        shape.remove_dimension(1);
-
-        _min_max.set_info(TensorInfo(shape, in->info()->num_channels(), DataType::F32));
-        _min_max.set_target(_target_hint);
-    }
-
-    bool minmax_is_loaded = _min_max.tensor() != nullptr;
-
-    // Create node context
-    NodeContext node_ctx(OperationType::DequantizationLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(_min_max.tensor());
-    node_ctx.add_output(out);
-
-    // Fill min max
-    if(!minmax_is_loaded)
-    {
-        _min_max.allocate_and_fill_if_needed();
-    }
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::DequantizationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/DepthConcatenateLayerNode.cpp b/src/graph/nodes/DepthConcatenateLayerNode.cpp
new file mode 100644
index 0000000..08cccc1
--- /dev/null
+++ b/src/graph/nodes/DepthConcatenateLayerNode.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DepthConcatenateLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DepthConcatenateLayerNode::DepthConcatenateLayerNode(unsigned int total_nodes)
+    : _total_nodes(total_nodes), _is_enabled(true)
+{
+    _input_edges.resize(_total_nodes, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+void DepthConcatenateLayerNode::set_enabled(bool is_enabled)
+{
+    _is_enabled = is_enabled;
+}
+
+bool DepthConcatenateLayerNode::is_enabled() const
+{
+    return _is_enabled;
+}
+
+TensorDescriptor DepthConcatenateLayerNode::compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors)
+{
+    ARM_COMPUTE_ERROR_ON(input_descriptors.size() == 0);
+
+    TensorDescriptor output_descriptor = input_descriptors[0];
+
+    size_t max_x = 0;
+    size_t max_y = 0;
+    size_t depth = 0;
+
+    for(const auto &input_descriptor : input_descriptors)
+    {
+        max_x = std::max(input_descriptor.shape.x(), max_x);
+        max_y = std::max(input_descriptor.shape.y(), max_y);
+        depth += input_descriptor.shape.z();
+    }
+
+    output_descriptor.shape.set(0, max_x);
+    output_descriptor.shape.set(1, max_y);
+    output_descriptor.shape.set(2, depth);
+
+    return output_descriptor;
+}
+
+bool DepthConcatenateLayerNode::forward_descriptors()
+{
+    if(_outputs[0] != NullTensorID)
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor DepthConcatenateLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    // Check if all input tensors are set
+    bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
+    {
+        return eid != EmptyEdgeID;
+    });
+
+    TensorDescriptor output_info = {};
+
+    if(are_all_inputs_set)
+    {
+        std::vector<TensorDescriptor> inputs_descriptors;
+        for(unsigned int i = 0; i < _input_edges.size(); ++i)
+        {
+            const Tensor *t = _graph->tensor(input_id(i));
+            ARM_COMPUTE_ERROR_ON(t == nullptr);
+            inputs_descriptors.push_back(t->desc());
+        }
+        output_info = compute_output_descriptor(inputs_descriptors);
+    }
+
+    return output_info;
+}
+
+NodeType DepthConcatenateLayerNode::type() const
+{
+    return NodeType::DepthConcatenateLayer;
+}
+
+void DepthConcatenateLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/DepthConvertLayer.cpp b/src/graph/nodes/DepthConvertLayer.cpp
deleted file mode 100644
index 9b328e7..0000000
--- a/src/graph/nodes/DepthConvertLayer.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/DepthConvertLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-DepthConvertLayer::DepthConvertLayer(const ConvertPolicy policy, uint32_t shift, DataType output_datatype)
-    : _policy(policy), _shift(shift), _output_datatype(output_datatype)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> DepthConvertLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    _target_hint              = ctx.hints().target_hint();
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-
-    // Auto configure output
-    arm_compute::auto_init_if_empty(*out->info(), in->info()->tensor_shape(), 1, _output_datatype, in->info()->fixed_point_position());
-
-    // Create node context
-    NodeContext node_ctx(OperationType::DepthConvertLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<ConvertPolicy>("ConvertPolicy", _policy);
-    node_ctx.add_parameter<uint32_t>("shift", _shift);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::DepthConvertLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayer.cpp b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
deleted file mode 100644
index e5101cc..0000000
--- a/src/graph/nodes/DepthwiseConvolutionLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/DepthwiseConvolutionLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> DepthwiseConvolutionLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    if(_weights.tensor() == nullptr)
-    {
-        TensorShape weights_shape(_conv_width, _conv_height, input->tensor()->info()->tensor_shape().z());
-        TensorInfo  info = TensorInfo(TensorShape(weights_shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
-        info.set_quantization_info(_quant_info);
-        _weights.set_info(std::move(info));
-    }
-    if(_biases.has_accessor() && _biases.tensor() == nullptr)
-    {
-        DataType dt = in->info()->data_type();
-        _biases.set_info(TensorInfo(TensorShape(in->info()->dimension(2)), in->info()->num_channels(), is_data_type_quantized_asymmetric(dt) ? DataType::S32 : dt, in->info()->fixed_point_position()));
-    }
-
-    bool weights_is_loaded = _weights.tensor() != nullptr;
-    bool biases_is_loaded  = _biases.has_accessor() ? _biases.tensor() != nullptr : true;
-
-    _weights.set_target(_target_hint);
-    if(_biases.has_accessor())
-    {
-        _biases.set_target(_target_hint);
-    }
-
-    // Create node context
-    NodeContext node_ctx(OperationType::DepthwiseConvolutionLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_input(_weights.tensor());
-    if(_biases.has_accessor())
-    {
-        node_ctx.add_input(_biases.tensor());
-    }
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<PadStrideInfo>("ConvolutionInfo", _conv_info);
-    node_ctx.add_parameter<bool>("Optimized3x3", _opt3x3);
-
-    // Configure operation
-    auto func = OperationRegistry::get().find_operation(OperationType::DepthwiseConvolutionLayer, _target_hint)->configure(node_ctx);
-
-    // Fill tensors
-    if(!weights_is_loaded)
-    {
-        _weights.allocate_and_fill_if_needed();
-    }
-    if(!biases_is_loaded)
-    {
-        _biases.allocate_and_fill_if_needed();
-    }
-
-    // Get function
-    return func;
-}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
new file mode 100644
index 0000000..1a6f8d3
--- /dev/null
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method)
+    : _info(std::move(info)), _method(method)
+{
+    _input_edges.resize(3, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+void DepthwiseConvolutionLayerNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method)
+{
+    _method = method;
+}
+
+DepthwiseConvolutionMethod DepthwiseConvolutionLayerNode::depthwise_convolution_method() const
+{
+    return _method;
+}
+
+PadStrideInfo DepthwiseConvolutionLayerNode::convolution_info() const
+{
+    return _info;
+}
+
+TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                          const TensorDescriptor &weights_descriptor,
+                                                                          const PadStrideInfo    &info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+
+    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
+
+    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), output_width);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), output_height);
+
+    return output_descriptor;
+}
+
+bool DepthwiseConvolutionLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor DepthwiseConvolutionLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    const Tensor *src     = input(0);
+    const Tensor *weights = input(1);
+
+    ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
+
+    return compute_output_descriptor(src->desc(), weights->desc(), _info);
+}
+
+NodeType DepthwiseConvolutionLayerNode::type() const
+{
+    return NodeType::DepthwiseConvolutionLayer;
+}
+
+void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp
new file mode 100644
index 0000000..568b882
--- /dev/null
+++ b/src/graph/nodes/EltwiseLayerNode.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/EltwiseLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+EltwiseLayerNode::EltwiseLayerNode(EltwiseOperation op, ConvertPolicy c_policy, RoundingPolicy r_policy)
+    : _op(op), _convert_policy(c_policy), _rounding_policy(r_policy)
+{
+    _input_edges.resize(2, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+EltwiseOperation EltwiseLayerNode::eltwise_operation() const
+{
+    return _op;
+}
+
+ConvertPolicy EltwiseLayerNode::convert_policy() const
+{
+    return _convert_policy;
+}
+
+RoundingPolicy EltwiseLayerNode::rounding_policy() const
+{
+    return _rounding_policy;
+}
+
+bool EltwiseLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor EltwiseLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx, _op, _convert_policy, _rounding_policy);
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return src->desc();
+}
+
+NodeType EltwiseLayerNode::type() const
+{
+    return NodeType::EltwiseLayer;
+}
+
+void EltwiseLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/FlattenLayer.cpp b/src/graph/nodes/FlattenLayer.cpp
deleted file mode 100644
index ea08296..0000000
--- a/src/graph/nodes/FlattenLayer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FlattenLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> FlattenLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    _target_hint              = ctx.hints().target_hint();
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-
-    // Auto configure output
-    TensorShape tensor_shape = in->info()->tensor_shape();
-    tensor_shape.collapse(in->info()->num_dimensions());
-    arm_compute::auto_init_if_empty(*out->info(), tensor_shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
-
-    // Create node context
-    NodeContext node_ctx(OperationType::FlattenLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::FlattenLayer, _target_hint)->configure(node_ctx);
-}
\ No newline at end of file
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
new file mode 100644
index 0000000..78b45dc
--- /dev/null
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FlattenLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+FlattenLayerNode::FlattenLayerNode()
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+bool FlattenLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor FlattenLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    TensorDescriptor output_desc = src->desc();
+    output_desc.shape.collapse(src->desc().shape.num_dimensions());
+
+    return output_desc;
+}
+
+NodeType FlattenLayerNode::type() const
+{
+    return NodeType::FlattenLayer;
+}
+
+void FlattenLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/FloorLayer.cpp b/src/graph/nodes/FloorLayer.cpp
deleted file mode 100644
index 8750546..0000000
--- a/src/graph/nodes/FloorLayer.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/FloorLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> FloorLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::FloorLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::FloorLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 3742150..d94a785 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,86 +21,89 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/nodes/FullyConnectedLayer.h"
+#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
 
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
 
-using namespace arm_compute::graph;
-
-namespace
+namespace arm_compute
 {
-TensorShape calculate_fullyconnected_layer_output_shape(const TensorShape &input_shape, unsigned int output_neurons)
+namespace graph
+{
+FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs)
+    : _num_outputs(num_outputs)
+{
+    _input_edges.resize(3, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+TensorDescriptor FullyConnectedLayerNode::compute_weights_descriptor(const TensorDescriptor &input_descriptor,
+                                                                     unsigned int            num_outputs)
+{
+    unsigned int num_weights    = 1;
+    unsigned int num_dimensions = input_descriptor.shape.num_dimensions();
+    // Ignore the batch dimension if there is one:
+    if(num_dimensions == 2 || num_dimensions == 4)
+    {
+        num_dimensions--;
+    }
+    for(unsigned int i = 0; i < num_dimensions; i++)
+    {
+        num_weights *= input_descriptor.shape[i];
+    }
+
+    TensorDescriptor weights_descriptor = input_descriptor;
+    weights_descriptor.shape            = TensorShape(num_weights, num_outputs);
+
+    return weights_descriptor;
+}
+
+TensorDescriptor FullyConnectedLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                    unsigned int            num_outputs)
 {
     // Note: Only 1D batch space is supported at the moment
-    unsigned int batches = input_shape[1];
-    if(input_shape.num_dimensions() > 2)
+    unsigned int batches = input_descriptor.shape[1];
+    if(input_descriptor.shape.num_dimensions() > 2)
     {
-        batches = input_shape[3];
+        batches = input_descriptor.shape[3];
     }
-    return TensorShape(output_neurons, batches);
-}
-} // namespace
 
-std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape            = TensorShape(num_outputs, batches);
+
+    return output_descriptor;
+}
+
+bool FullyConnectedLayerNode::forward_descriptors()
 {
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    if(_weights.tensor() == nullptr)
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
-        unsigned int num_weights    = 1;
-        unsigned int num_dimensions = in->info()->num_dimensions();
-        // Ignore the batch dimension if there is one:
-        if(num_dimensions == 2 || num_dimensions == 4)
-        {
-            num_dimensions--;
-        }
-        for(unsigned int i = 0; i < num_dimensions; i++)
-        {
-            num_weights *= in->info()->dimension(i);
-        }
-        _weights.set_info(TensorInfo(TensorShape(num_weights, _num_neurons), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
     }
-    if(_biases.tensor() == nullptr)
-    {
-        _biases.set_info(TensorInfo(TensorShape(_num_neurons), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
-    }
-
-    // Auto configure output
-    arm_compute::auto_init_if_empty(*out->info(),
-                                    calculate_fullyconnected_layer_output_shape(in->info()->tensor_shape(), _num_neurons),
-                                    in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
-
-    bool weights_are_loaded = _weights.tensor() != nullptr;
-    bool biases_are_loaded  = _biases.tensor() != nullptr;
-
-    // Create node context
-    NodeContext node_ctx(OperationType::FullyConnectedLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_input(_weights.set_target(_target_hint));
-    node_ctx.add_input(_biases.set_target(_target_hint));
-    node_ctx.add_output(out);
-
-    // Configure operation
-    auto func = OperationRegistry::get().find_operation(OperationType::FullyConnectedLayer, _target_hint)->configure(node_ctx);
-
-    // Fill biases
-    if(!weights_are_loaded)
-    {
-        _weights.allocate_and_fill_if_needed();
-    }
-    if(!biases_are_loaded)
-    {
-        _biases.allocate_and_fill_if_needed();
-    }
-
-    // Get function
-    return func;
+    return false;
 }
+
+TensorDescriptor FullyConnectedLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return compute_output_descriptor(src->desc(), _num_outputs);
+}
+
+NodeType FullyConnectedLayerNode::type() const
+{
+    return NodeType::FullyConnectedLayer;
+}
+
+void FullyConnectedLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp
new file mode 100644
index 0000000..709eaae
--- /dev/null
+++ b/src/graph/nodes/InputNode.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/InputNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+InputNode::InputNode(TensorDescriptor desc)
+    : _desc(std::move(desc))
+{
+    _outputs.resize(1, NullTensorID);
+}
+
+bool InputNode::forward_descriptors()
+{
+    if(output_id(0) != NullTensorID)
+    {
+        Tensor *t = output(0);
+        ARM_COMPUTE_ERROR_ON(t == nullptr);
+        t->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor InputNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    return _desc;
+}
+
+NodeType InputNode::type() const
+{
+    return NodeType::Input;
+}
+
+void InputNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/L2NormalizeLayer.cpp b/src/graph/nodes/L2NormalizeLayer.cpp
deleted file mode 100644
index 9813ba4..0000000
--- a/src/graph/nodes/L2NormalizeLayer.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/L2NormalizeLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-L2NormalizeLayer::L2NormalizeLayer(unsigned int axis, float epsilon)
-    : _axis(axis), _epsilon(epsilon)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> L2NormalizeLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::L2NormalizeLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<unsigned int>("axis", _axis);
-    node_ctx.add_parameter<float>("epsilon", _epsilon);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::L2NormalizeLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/NormalizationLayer.cpp b/src/graph/nodes/NormalizationLayer.cpp
deleted file mode 100644
index a489329..0000000
--- a/src/graph/nodes/NormalizationLayer.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/NormalizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-NormalizationLayer::NormalizationLayer(const NormalizationLayerInfo norm_info)
-    : _norm_info(norm_info)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> NormalizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::NormalizationLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<NormalizationLayerInfo>("NormalizationLayerInfo", _norm_info);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::NormalizationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp
new file mode 100644
index 0000000..a7b3738
--- /dev/null
+++ b/src/graph/nodes/NormalizationLayerNode.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/NormalizationLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info)
+    : _info(norm_info)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+NormalizationLayerInfo NormalizationLayerNode::normalization_info() const
+{
+    return _info;
+}
+
+bool NormalizationLayerNode::forward_descriptors()
+{
+    if(input_id(0) != NullTensorID && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor NormalizationLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return src->desc();
+}
+
+NodeType NormalizationLayerNode::type() const
+{
+    return NodeType::NormalizationLayer;
+}
+
+void NormalizationLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/nodes/OutputNode.cpp
similarity index 61%
copy from src/graph/CL/CLMap.cpp
copy to src/graph/nodes/OutputNode.cpp
index 5289ea9..8aa249b 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/nodes/OutputNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,41 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/graph/nodes/OutputNode.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Tensor.h"
 
-using namespace arm_compute::graph;
-
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+namespace arm_compute
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
+namespace graph
+{
+OutputNode::OutputNode()
+{
+    _input_edges.resize(1, EmptyEdgeID);
 }
 
-void CLMap::run()
+bool OutputNode::forward_descriptors()
 {
-    _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+    return true;
 }
+
+TensorDescriptor OutputNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    return TensorDescriptor();
+}
+
+NodeType OutputNode::type() const
+{
+    return NodeType::Output;
+}
+
+void OutputNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
deleted file mode 100644
index 2c15119..0000000
--- a/src/graph/nodes/PoolingLayer.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/PoolingLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-PoolingLayer::PoolingLayer(const PoolingLayerInfo pool_info)
-    : _pool_info(pool_info)
-{
-}
-
-std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::PoolingLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-    node_ctx.add_parameter<PoolingLayerInfo>("PoolingLayerInfo", _pool_info);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::PoolingLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
new file mode 100644
index 0000000..26c145a
--- /dev/null
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PoolingLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info)
+    : _info(std::move(pool_info))
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+PoolingLayerInfo PoolingLayerNode::pooling_info() const
+{
+    return _info;
+}
+
+TensorDescriptor PoolingLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                             PoolingLayerInfo        info)
+{
+    unsigned int pooled_width  = 0;
+    unsigned int pooled_height = 0;
+
+    const unsigned int input_width  = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int pool_size_x  = info.is_global_pooling() ? input_width : info.pool_size().width;
+    const unsigned int pool_size_y  = info.is_global_pooling() ? input_height : info.pool_size().height;
+
+    std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info());
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), pooled_width);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), pooled_height);
+
+    return output_descriptor;
+}
+
+bool PoolingLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor PoolingLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return compute_output_descriptor(src->desc(), _info);
+}
+
+NodeType PoolingLayerNode::type() const
+{
+    return NodeType::PoolingLayer;
+}
+
+void PoolingLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/QuantizationLayer.cpp b/src/graph/nodes/QuantizationLayer.cpp
deleted file mode 100644
index c102f47..0000000
--- a/src/graph/nodes/QuantizationLayer.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/QuantizationLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> QuantizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    _target_hint              = ctx.hints().target_hint();
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::QuantizationLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::QuantizationLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
index b0c117e..58610e9 100644
--- a/src/graph/nodes/ReshapeLayer.cpp
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,37 +21,56 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/nodes/ReshapeLayer.h"
+#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
 
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
 
-using namespace arm_compute::graph;
-
-ReshapeLayer::ReshapeLayer(TensorShape shape)
+namespace arm_compute
+{
+namespace graph
+{
+ReshapeLayerNode::ReshapeLayerNode(TensorShape shape)
     : _shape(shape)
 {
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
 }
 
-std::unique_ptr<arm_compute::IFunction> ReshapeLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+bool ReshapeLayerNode::forward_descriptors()
 {
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    _target_hint              = ctx.hints().target_hint();
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-
-    // Auto configure output
-    arm_compute::auto_init_if_empty(*out->info(), _shape, 1, in->info()->data_type(), in->info()->fixed_point_position(), in->info()->quantization_info());
-
-    // Create node context
-    NodeContext node_ctx(OperationType::ReshapeLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::ReshapeLayer, _target_hint)->configure(node_ctx);
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
 }
+
+TensorDescriptor ReshapeLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    TensorDescriptor output_desc = src->desc();
+    output_desc.shape            = _shape;
+
+    return output_desc;
+}
+
+NodeType ReshapeLayerNode::type() const
+{
+    return NodeType::ReshapeLayer;
+}
+
+void ReshapeLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/ResidualLayer.cpp b/src/graph/nodes/ResidualLayer.cpp
deleted file mode 100644
index 87404f9..0000000
--- a/src/graph/nodes/ResidualLayer.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/ResidualLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "arm_compute/graph/SubGraph.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "support/ToolchainSupport.h"
-#include "utils/Utils.h"
-
-#include <memory>
-#include <tuple>
-#include <vector>
-
-using namespace arm_compute::graph;
-
-/** Residual function */
-class ResidualFunction final : public arm_compute::IFunction
-{
-public:
-    /** Default Constructor */
-    ResidualFunction(GraphContext &ctx, ITensorObject *output)
-        : _ctx(ctx), _input(nullptr), _output(output), _func(nullptr), _graphs(), _graph_outputs()
-    {
-    }
-
-    /** Prevent instances from being copy constructed */
-    ResidualFunction(const ResidualFunction &) = delete;
-    /** Prevent instances from being copy assigned */
-    const ResidualFunction &operator=(const ResidualFunction &) = delete;
-    /** Prevent instances from being move constructed */
-    ResidualFunction(ResidualFunction &&) = delete;
-    /** Prevent instances from being move assigned */
-    ResidualFunction &operator=(ResidualFunction &&) = delete;
-    /** Default destructor */
-    ~ResidualFunction() override = default;
-
-    /** Set the input (when using only one sub graph)
-     *
-     * @param[in] input Input to set
-     */
-    void set_input(std::unique_ptr<ITensorObject> input)
-    {
-        _input = std::move(input);
-    }
-
-    /** Registers graph to be executed by the residual function
-     *
-     * @param[in] graph  Graph to register
-     * @param[in] output Output to register
-     */
-    void register_graph(std::unique_ptr<Graph> graph, std::unique_ptr<ITensorObject> output)
-    {
-        _graphs.push_back(std::move(graph));
-        _graph_outputs.push_back(std::move(output));
-    }
-
-    /** Configure the function */
-    void configure()
-    {
-        ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
-        TargetHint target_hint = _ctx.hints().target_hint();
-
-        // Create node context
-        NodeContext node_ctx(OperationType::ArithmeticAddition);
-        node_ctx.set_target(target_hint);
-
-        if(_graphs.size() == 1)
-        {
-            arm_compute::ITensor *in = _input->tensor();
-            node_ctx.add_input(in);
-        }
-
-        for(auto &o : _graph_outputs)
-        {
-            arm_compute::ITensor *in = o->tensor();
-            node_ctx.add_input(in);
-        }
-
-        arm_compute::ITensor *out = _output->tensor();
-        auto_init_if_empty(*out->info(), *_graph_outputs[0]->tensor()->info());
-        node_ctx.add_output(out);
-
-        _func = OperationRegistry::get().find_operation(OperationType::ArithmeticAddition, target_hint)->configure(node_ctx);
-
-        for(auto &o : _graph_outputs)
-        {
-            o->allocate();
-        }
-    }
-
-    // Inherited methods overriden:
-    void run() override
-    {
-        ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
-
-        for(auto &g : _graphs)
-        {
-            ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
-            g->run();
-        }
-
-        _func->run();
-    }
-
-private:
-    GraphContext                                _ctx;
-    std::unique_ptr<ITensorObject>              _input;
-    ITensorObject                              *_output;
-    std::unique_ptr<arm_compute::IFunction>     _func;
-    std::vector<std::unique_ptr<Graph>>         _graphs;
-    std::vector<std::unique_ptr<ITensorObject>> _graph_outputs;
-};
-
-std::unique_ptr<arm_compute::IFunction> ResidualLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
-
-    // Create residual function
-    auto func = arm_compute::support::cpp14::make_unique<ResidualFunction>(ctx, output);
-
-    if(_sub_graphs.size() == 1)
-    {
-        std::unique_ptr<ITensorObject> original_in;
-        original_in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
-                                                                          input->tensor()->info()->tensor_shape(),
-                                                                          Coordinates());
-        func->set_input(std::move(original_in));
-    }
-
-    // Constuct all sub-graphs given the input/output
-    for(auto &sg : _sub_graphs)
-    {
-        ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
-
-        // IO buffers
-        std::unique_ptr<ITensorObject> in;
-        std::unique_ptr<ITensorObject> out;
-        std::unique_ptr<ITensorObject> func_in;
-
-        // Create input sub-tensor
-        if(!sg->has_input())
-        {
-            in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
-                                                                     input->tensor()->info()->tensor_shape(),
-                                                                     Coordinates());
-        }
-
-        // Create output sub-tensor
-        if(!sg->has_output())
-        {
-            ITensorInfo *info = input->tensor()->info();
-            func_in           = arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo(info->num_channels(), info->data_type(), info->fixed_point_position()));
-            func_in->set_target(ctx.hints().target_hint());
-            out = arm_compute::support::cpp14::make_unique<SubTensor>(func_in->tensor(),
-                                                                      TensorShape(),
-                                                                      Coordinates(0, 0, 0),
-                                                                      func_in->target(),
-                                                                      true);
-        }
-
-        // Construct sub_graph
-        auto g = sg->construct(ctx, std::move(in), std::move(out));
-
-        // Register graph to function
-        func->register_graph(std::move(g), std::move(func_in));
-    }
-
-    func->configure();
-
-    return std::move(func);
-}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
deleted file mode 100644
index 7f2325b..0000000
--- a/src/graph/nodes/SoftmaxLayer.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/nodes/SoftmaxLayer.h"
-
-#include "arm_compute/graph/Error.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistry.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute::graph;
-
-std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
-{
-    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
-
-    arm_compute::ITensor *in  = input->tensor();
-    arm_compute::ITensor *out = output->tensor();
-    _target_hint              = ctx.hints().target_hint();
-
-    // Create node context
-    NodeContext node_ctx(OperationType::SoftmaxLayer);
-    node_ctx.set_target(_target_hint);
-    node_ctx.add_input(in);
-    node_ctx.add_output(out);
-
-    // Get function
-    return OperationRegistry::get().find_operation(OperationType::SoftmaxLayer, _target_hint)->configure(node_ctx);
-}
diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp
new file mode 100644
index 0000000..57e5561
--- /dev/null
+++ b/src/graph/nodes/SoftmaxLayerNode.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SoftmaxLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+SoftmaxLayerNode::SoftmaxLayerNode(float beta)
+    : _beta(beta)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+float SoftmaxLayerNode::beta() const
+{
+    return _beta;
+}
+
+bool SoftmaxLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor SoftmaxLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    TensorDescriptor out_desc = src->desc();
+    out_desc.quant_info       = QuantizationInfo(1.f / 256.f, 0);
+
+    return out_desc;
+}
+
+NodeType SoftmaxLayerNode::type() const
+{
+    return NodeType::SoftmaxLayer;
+}
+
+void SoftmaxLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp
new file mode 100644
index 0000000..5d46c9d
--- /dev/null
+++ b/src/graph/nodes/SplitLayerNode.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SplitLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+SplitLayerNode::SplitLayerNode(unsigned int num_splits, unsigned int axis)
+    : _num_splits(num_splits), _axis(axis)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(num_splits, NullTensorID);
+}
+
+unsigned int SplitLayerNode::num_splits() const
+{
+    return _num_splits;
+}
+
+unsigned int SplitLayerNode::axis() const
+{
+    return _axis;
+}
+
+std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                                   unsigned int num_splits, unsigned int axis, unsigned int idx)
+{
+    const unsigned int split_size = input_descriptor.shape[axis] / num_splits;
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(axis, split_size);
+
+    Coordinates coords;
+    coords.set(axis, idx * split_size);
+
+    return std::make_pair(output_descriptor, coords);
+}
+
+bool SplitLayerNode::forward_descriptors()
+{
+    if(input_id(0) != NullTensorID)
+    {
+        validate();
+        for(unsigned int i = 0; i < _outputs.size(); ++i)
+        {
+            if(output_id(i) != NullTensorID)
+            {
+                Tensor *dst_i = output(i);
+                ARM_COMPUTE_ERROR_ON(dst_i == nullptr);
+                dst_i->desc() = configure_output(i);
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor SplitLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    TensorDescriptor output_info;
+    std::tie(output_info, std::ignore) = compute_output_descriptor(src->desc(), _num_splits, _axis, idx);
+
+    return output_info;
+}
+
+Status SplitLayerNode::validate() const
+{
+    const Tensor *src = input(0);
+    ARM_COMPUTE_RETURN_ERROR_ON(src == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(_axis >= src->desc().shape.num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[_axis] % _num_splits, "Split should be exact");
+
+    return Status{};
+}
+
+NodeType SplitLayerNode::type() const
+{
+    return NodeType::SplitLayer;
+}
+
+void SplitLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
deleted file mode 100644
index fe56122..0000000
--- a/src/graph/operations/CLSimpleOperations.cpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/graph/IOperation.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistrar.h"
-#include "arm_compute/graph/Types.h"
-#include "arm_compute/runtime/CL/CLFunctions.h"
-#include "support/ToolchainSupport.h"
-#include "utils/GraphTypePrinter.h"
-#include "utils/TypePrinter.h"
-
-#include <memory>
-
-using namespace arm_compute::graph;
-
-/* Activation Layer */
-REGISTER_SIMPLE_OPERATION(CLActivationLayerOperation, OPENCL, OperationType::ActivationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in       = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *out      = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
-
-    // Create and configure function
-    auto activation = arm_compute::support::cpp14::make_unique<arm_compute::CLActivationLayer>();
-    activation->configure(in, out, act_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLActivationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Activation function: " << act_info.activation()
-                               << " a: " << act_info.a()
-                               << " b: " << act_info.b()
-                               << std::endl);
-
-    return std::move(activation);
-}
-
-/* Arithmetic addition */
-REGISTER_SIMPLE_OPERATION(CLArithmeticAdditionOperation, OPENCL, OperationType::ArithmeticAddition)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in1 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *in2 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
-    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    auto addition = arm_compute::support::cpp14::make_unique<arm_compute::CLArithmeticAddition>();
-    addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLArithmeticAddition"
-                               << " Data Type: " << in1->info()->data_type()
-                               << " Input 1 shape: " << in1->info()->tensor_shape()
-                               << " Input 2 shape: " << in2->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(addition);
-}
-
-/* Batch Normalization Layer */
-REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationType::BatchNormalizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 5);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in       = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *mean     = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
-    auto      *var      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
-    auto      *beta     = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3));
-    auto      *gamma    = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4));
-    auto      *out      = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto epsilon  = ctx.parameter<float>("epsilon");
-    const auto act_info = ctx.parameter<ActivationLayerInfo>("act_info");
-
-    // Create and configure function
-    auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLBatchNormalizationLayer>();
-    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon, act_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLBatchNormalizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Mean shape: " << mean->info()->tensor_shape()
-                               << " Var shape: " << var->info()->tensor_shape()
-                               << " Beta shape: " << beta->info()->tensor_shape()
-                               << " Gamma shape: " << gamma->info()->tensor_shape()
-                               << " Epsilon: " << epsilon
-                               << " Activation function: " << act_info.activation()
-                               << " a: " << act_info.a()
-                               << " b: " << act_info.b()
-                               << std::endl);
-
-    return std::move(batch_norm);
-}
-
-/* DepthConvertLayer Layer */
-REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::DepthConvertLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in          = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *out         = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto conv_policy = ctx.parameter<ConvertPolicy>("ConvertPolicy");
-    const auto shift       = ctx.parameter<uint32_t>("shift");
-
-    // Create and configure function
-    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthConvertLayer>();
-    depthconvert->configure(in, out, conv_policy, shift);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDepthConvertLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " shift: " << shift
-                               << std::endl);
-
-    return std::move(depthconvert);
-}
-
-/* DepthwiseConvolutionLayer Layer */
-REGISTER_SIMPLE_OPERATION(CLDepthwiseConvolutionOperation, OPENCL, OperationType::DepthwiseConvolutionLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2 && ctx.num_inputs() != 3);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in        = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *weights   = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
-    auto      *biases    = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) : nullptr;
-    auto      *out       = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
-    const auto opt3x3    = ctx.parameter<bool>("Optimized3x3");
-
-    // Create and configure function
-    std::unique_ptr<arm_compute::IFunction> func;
-    bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
-    if(run_3x3_opt)
-    {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
-        depwthwise_conv->configure(in, weights, biases, out, conv_info);
-        func = std::move(depwthwise_conv);
-    }
-    else
-    {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
-        depwthwise_conv->configure(in, weights, biases, out, conv_info);
-        func = std::move(depwthwise_conv);
-    }
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDepthwiseConvolutionLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Weights shape: " << weights->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape());
-    if(biases == nullptr)
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: No biases provided" << std::endl);
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: " << biases->info()->tensor_shape() << std::endl);
-    }
-
-    return func;
-}
-
-/* DeQuantizationLayer Layer */
-REGISTER_SIMPLE_OPERATION(CLDequantizationLayerOperation, OPENCL, OperationType::DequantizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 2);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(1)) == nullptr);
-
-    // Extract IO and info
-    auto *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    auto *min_max = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(1));
-
-    // Create and configure function
-    auto dequantization = arm_compute::support::cpp14::make_unique<arm_compute::CLDequantizationLayer>();
-    dequantization->configure(in, out, min_max);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDequantizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Min max shape: " << min_max->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(dequantization);
-}
-
-/* Flatten Layer */
-REGISTER_SIMPLE_OPERATION(CLFlattenLayerOperation, OPENCL, OperationType::FlattenLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto flatten = arm_compute::support::cpp14::make_unique<arm_compute::CLFlattenLayer>();
-    flatten->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFlattenLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(flatten);
-}
-
-/* Floor Layer */
-REGISTER_SIMPLE_OPERATION(CLFloorLayerOperation, OPENCL, OperationType::FloorLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto floor = arm_compute::support::cpp14::make_unique<arm_compute::CLFloor>();
-    floor->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFloorLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(floor);
-}
-
-/* Fully Connected Layer */
-REGISTER_SIMPLE_OPERATION(CLFullyConnectedLayer, OPENCL, OperationType::FullyConnectedLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 3);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *weights = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
-    auto *biases  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
-    auto *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto fc = arm_compute::support::cpp14::make_unique<arm_compute::CLFullyConnectedLayer>();
-    fc->configure(in, weights, biases, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFullyConnectedLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Weights shape: " << weights->info()->tensor_shape()
-                               << " Biases Shape: " << biases->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(fc);
-}
-
-/* L2 Normalize Layer */
-REGISTER_SIMPLE_OPERATION(CLL2NormalizeLayerOperation, OPENCL, OperationType::L2NormalizeLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto axis    = ctx.parameter<unsigned int>("axis");
-    const auto epsilon = ctx.parameter<float>("epsilon");
-
-    // Create and configure function
-    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLL2NormalizeLayer>();
-    l2_norm->configure(in, out, axis, epsilon);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLL2NormalizeLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Axis: " << axis
-                               << " Epsilon: " << epsilon
-                               << std::endl);
-
-    return std::move(l2_norm);
-}
-
-/* Normalization Layer */
-REGISTER_SIMPLE_OPERATION(CLNormalizationLayerOperation, OPENCL, OperationType::NormalizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in        = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *out       = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto norm_info = ctx.parameter<NormalizationLayerInfo>("NormalizationLayerInfo");
-
-    // Create and configure function
-    auto norm = arm_compute::support::cpp14::make_unique<arm_compute::CLNormalizationLayer>();
-    norm->configure(in, out, norm_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLNormalizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Normalization info: " << norm_info
-                               << std::endl);
-
-    return std::move(norm);
-}
-
-/* Pooling Layer */
-REGISTER_SIMPLE_OPERATION(CLPoolingLayerOperation, OPENCL, OperationType::PoolingLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in        = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto      *out       = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-    const auto pool_info = ctx.parameter<PoolingLayerInfo>("PoolingLayerInfo");
-
-    // Create and configure function
-    auto pool = arm_compute::support::cpp14::make_unique<arm_compute::CLPoolingLayer>();
-    pool->configure(in, out, pool_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLPoolingLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Pooling info: " << pool_info
-                               << std::endl);
-
-    return std::move(pool);
-}
-
-/* Quantization Layer */
-REGISTER_SIMPLE_OPERATION(CLQuantizationLayerOperation, OPENCL, OperationType::QuantizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto quantization = arm_compute::support::cpp14::make_unique<arm_compute::CLQuantizationLayer>();
-    quantization->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLQuantizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(quantization);
-}
-
-/* Reshape Layer */
-REGISTER_SIMPLE_OPERATION(CLReshapeLayerOperation, OPENCL, OperationType::ReshapeLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto reshape = arm_compute::support::cpp14::make_unique<arm_compute::CLReshapeLayer>();
-    reshape->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLReshapeLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(reshape);
-}
-
-/* Softmax Layer */
-REGISTER_SIMPLE_OPERATION(CLSoftmaxLayerOperation, OPENCL, OperationType::SoftmaxLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto smx = arm_compute::support::cpp14::make_unique<arm_compute::CLSoftmaxLayer>();
-    smx->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLSoftmaxLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(smx);
-}
diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
deleted file mode 100644
index 4154b9a..0000000
--- a/src/graph/operations/NESimpleOperations.cpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/graph/IOperation.h"
-#include "arm_compute/graph/NodeContext.h"
-#include "arm_compute/graph/OperationRegistrar.h"
-#include "arm_compute/graph/Types.h"
-#include "arm_compute/runtime/NEON/NEFunctions.h"
-#include "support/ToolchainSupport.h"
-#include "utils/GraphTypePrinter.h"
-#include "utils/TypePrinter.h"
-
-#include <memory>
-
-using namespace arm_compute::graph;
-
-/* Activation Layer */
-REGISTER_SIMPLE_OPERATION(NEActivationLayerOperation, NEON, OperationType::ActivationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in       = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *out      = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
-
-    // Create and configure function
-    auto activation = arm_compute::support::cpp14::make_unique<arm_compute::NEActivationLayer>();
-    activation->configure(in, out, act_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEActivationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Activation function: " << act_info.activation()
-                               << " a: " << act_info.a()
-                               << " b: " << act_info.b()
-                               << std::endl);
-
-    return std::move(activation);
-}
-
-/* Arithmetic addition */
-REGISTER_SIMPLE_OPERATION(NEArithmeticAdditionOperation, NEON, OperationType::ArithmeticAddition)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in1 = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *in2 = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
-    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    auto addition = arm_compute::support::cpp14::make_unique<arm_compute::NEArithmeticAddition>();
-    addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEArithmeticAddition"
-                               << " Data Type: " << in1->info()->data_type()
-                               << " Input 1 shape: " << in1->info()->tensor_shape()
-                               << " Input 2 shape: " << in2->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(addition);
-}
-
-/* Batch Normalization Layer */
-REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationType::BatchNormalizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 5);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(3)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(4)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in       = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *mean     = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
-    auto      *var      = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
-    auto      *beta     = dynamic_cast<arm_compute::ITensor *>(ctx.input(3));
-    auto      *gamma    = dynamic_cast<arm_compute::ITensor *>(ctx.input(4));
-    auto      *out      = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto epsilon  = ctx.parameter<float>("epsilon");
-    const auto act_info = ctx.parameter<ActivationLayerInfo>("act_info");
-
-    // Create and configure function
-    auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEBatchNormalizationLayer>();
-    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon, act_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEBatchNormalizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Mean shape: " << mean->info()->tensor_shape()
-                               << " Var shape: " << var->info()->tensor_shape()
-                               << " Beta shape: " << beta->info()->tensor_shape()
-                               << " Gamma shape: " << gamma->info()->tensor_shape()
-                               << " Epsilon: " << epsilon
-                               << " Activation function: " << act_info.activation()
-                               << " a: " << act_info.a()
-                               << " b: " << act_info.b()
-                               << std::endl);
-
-    return std::move(batch_norm);
-}
-
-/* DepthConvertLayer Layer */
-REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::DepthConvertLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in          = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *out         = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto conv_policy = ctx.parameter<ConvertPolicy>("ConvertPolicy");
-    const auto shift       = ctx.parameter<uint32_t>("shift");
-
-    // Create and configure function
-    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthConvertLayer>();
-    depthconvert->configure(in, out, conv_policy, shift);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthConvertLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " shift: " << shift
-                               << std::endl);
-
-    return std::move(depthconvert);
-}
-
-/* DepthwiseConvolutionLayer Layer */
-REGISTER_SIMPLE_OPERATION(NEDepthwiseConvolutionOperation, NEON, OperationType::DepthwiseConvolutionLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2 && ctx.num_inputs() != 3);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in        = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *weights   = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
-    auto      *biases    = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) : nullptr;
-    auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
-    const auto opt3x3    = ctx.parameter<bool>("Optimized3x3");
-
-    // Create and configure function
-    std::unique_ptr<arm_compute::IFunction> func;
-    bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
-    if(run_3x3_opt)
-    {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
-        depwthwise_conv->configure(in, weights, biases, out, conv_info);
-        func = std::move(depwthwise_conv);
-    }
-    else
-    {
-        auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
-        depwthwise_conv->configure(in, weights, biases, out, conv_info);
-        func = std::move(depwthwise_conv);
-    }
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthwiseConvolutionLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Weights shape: " << weights->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape());
-    if(biases == nullptr)
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: No biases provided" << std::endl);
-    }
-    else
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: " << biases->info()->tensor_shape() << std::endl);
-    }
-
-    return func;
-}
-
-/* DeQuantizationLayer Layer */
-REGISTER_SIMPLE_OPERATION(NEDequantizationLayerOperation, NEON, OperationType::DequantizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 2);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(1)) == nullptr);
-
-    // Extract IO and info
-    auto *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    auto *min_max = dynamic_cast<arm_compute::ITensor *>(ctx.output(1));
-
-    // Create and configure function
-    auto dequantization = arm_compute::support::cpp14::make_unique<arm_compute::NEDequantizationLayer>();
-    dequantization->configure(in, out, min_max);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDequantizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Min max shape: " << min_max->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(dequantization);
-}
-
-/* Flatten Layer */
-REGISTER_SIMPLE_OPERATION(NEFlattenLayerOperation, NEON, OperationType::FlattenLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto flatten = arm_compute::support::cpp14::make_unique<arm_compute::NEFlattenLayer>();
-    flatten->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFlattenLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(flatten);
-}
-
-/* Floor Layer */
-REGISTER_SIMPLE_OPERATION(NEFloorLayerOperation, NEON, OperationType::FloorLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto floor = arm_compute::support::cpp14::make_unique<arm_compute::NEFloor>();
-    floor->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFloorLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(floor);
-}
-
-/* Fully Connected Layer */
-REGISTER_SIMPLE_OPERATION(NEFullyConnectedLayer, NEON, OperationType::FullyConnectedLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 3);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *weights = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
-    auto *biases  = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
-    auto *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto fc = arm_compute::support::cpp14::make_unique<arm_compute::NEFullyConnectedLayer>();
-    fc->configure(in, weights, biases, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFullyConnectedLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Weights shape: " << weights->info()->tensor_shape()
-                               << " Biases Shape: " << biases->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(fc);
-}
-
-/* L2 Normalize Layer */
-REGISTER_SIMPLE_OPERATION(NEL2NormalizeLayerOperation, NEON, OperationType::L2NormalizeLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto axis    = ctx.parameter<unsigned int>("axis");
-    const auto epsilon = ctx.parameter<float>("epsilon");
-
-    // Create and configure function
-    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEL2NormalizeLayer>();
-    l2_norm->configure(in, out, axis, epsilon);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEL2NormalizeLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Axis: " << axis
-                               << " Epsilon: " << epsilon
-                               << std::endl);
-
-    return std::move(l2_norm);
-}
-
-/* Normalization Layer */
-REGISTER_SIMPLE_OPERATION(NENormalizationLayerOperation, NEON, OperationType::NormalizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in        = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto norm_info = ctx.parameter<NormalizationLayerInfo>("NormalizationLayerInfo");
-
-    // Create and configure function
-    auto norm = arm_compute::support::cpp14::make_unique<arm_compute::NENormalizationLayer>();
-    norm->configure(in, out, norm_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NENormalizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Normalization info: " << norm_info
-                               << std::endl);
-
-    return std::move(norm);
-}
-
-/* Pooling Layer */
-REGISTER_SIMPLE_OPERATION(NEPoolingLayerOperation, NEON, OperationType::PoolingLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto      *in        = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-    const auto pool_info = ctx.parameter<PoolingLayerInfo>("PoolingLayerInfo");
-
-    // Create and configure function
-    auto pool = arm_compute::support::cpp14::make_unique<arm_compute::NEPoolingLayer>();
-    pool->configure(in, out, pool_info);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEPoolingLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << " Pooling info: " << pool_info
-                               << std::endl);
-
-    return std::move(pool);
-}
-
-/* Quantization Layer */
-REGISTER_SIMPLE_OPERATION(NEQuantizationLayerOperation, NEON, OperationType::QuantizationLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto quantization = arm_compute::support::cpp14::make_unique<arm_compute::NEQuantizationLayer>();
-    quantization->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEQuantizationLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(quantization);
-}
-
-/* Reshape Layer */
-REGISTER_SIMPLE_OPERATION(NEReshapeLayerOperation, NEON, OperationType::ReshapeLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto reshape = arm_compute::support::cpp14::make_unique<arm_compute::NEReshapeLayer>();
-    reshape->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEReshapeLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(reshape);
-}
-
-/* Softmax Layer */
-REGISTER_SIMPLE_OPERATION(NESoftmaxLayerOperation, NEON, OperationType::SoftmaxLayer)
-{
-    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
-    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
-    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
-
-    // Extract IO and info
-    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
-    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
-
-    // Create and configure function
-    auto smx = arm_compute::support::cpp14::make_unique<arm_compute::NESoftmaxLayer>();
-    smx->configure(in, out);
-
-    // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NESoftmaxLayer"
-                               << " Data Type: " << in->info()->data_type()
-                               << " Input shape: " << in->info()->tensor_shape()
-                               << " Output shape: " << out->info()->tensor_shape()
-                               << std::endl);
-
-    return std::move(smx);
-}
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
new file mode 100644
index 0000000..61cf423
--- /dev/null
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/printers/DotGraphPrinter.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/TypePrinter.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+void DotGraphVisitor::visit(ActivationLayerNode &n)
+{
+    std::stringstream ss;
+    ss << n.activation_info().activation();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(BatchNormalizationLayerNode &n)
+{
+    std::stringstream ss;
+    ss << (n.fused_activation().enabled() ? to_string(n.fused_activation().activation()) : "");
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(ConvolutionLayerNode &n)
+{
+    std::stringstream ss;
+    ss << n.convolution_method();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(DepthConcatenateLayerNode &n)
+{
+    std::stringstream ss;
+    ss << "Enabled: " << n.is_enabled();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(DepthwiseConvolutionLayerNode &n)
+{
+    std::stringstream ss;
+    ss << n.depthwise_convolution_method();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(EltwiseLayerNode &n)
+{
+    std::stringstream ss;
+    ss << n.eltwise_operation();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(NormalizationLayerNode &n)
+{
+    std::stringstream ss;
+    ss << n.normalization_info().type();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::visit(PoolingLayerNode &n)
+{
+    std::stringstream ss;
+    ss << n.pooling_info().pool_type();
+    ss << R"( \n )";
+    ss << n.pooling_info().pool_size();
+    ss << R"( \n )";
+    ss << n.pooling_info().pad_stride_info();
+    _info = ss.str();
+}
+
+void DotGraphVisitor::default_visit()
+{
+    _info.clear();
+}
+
+const std::string &DotGraphVisitor::info() const
+{
+    return _info;
+}
+
+void DotGraphPrinter::print(const Graph &g, std::ostream &os)
+{
+    // Print header
+    print_header(g, os);
+
+    // Print nodes
+    print_nodes(g, os);
+
+    // Print edges
+    print_edges(g, os);
+
+    // Print footer
+    print_footer(g, os);
+}
+
+void DotGraphPrinter::print_header(const Graph &g, std::ostream &os)
+{
+    // Print graph name
+    std::string graph_name = (g.name().empty()) ? "Graph" : g.name();
+    os << "digraph " << graph_name << "{\n";
+}
+
+void DotGraphPrinter::print_footer(const Graph &g, std::ostream &os)
+{
+    ARM_COMPUTE_UNUSED(g);
+    os << "}\n";
+}
+
+void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os)
+{
+    for(const auto &n : g.nodes())
+    {
+        if(n)
+        {
+            // Output node id
+            std::string node_id = std::string("n") + support::cpp11::to_string(n->id());
+            os << node_id << " ";
+
+            // Output label
+            n->accept(_dot_node_visitor);
+
+            std::string name             = n->name().empty() ? node_id : n->name();
+            auto        node_description = _dot_node_visitor.info();
+
+            os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description << R"("])";
+            os << ";\n";
+        }
+    }
+}
+
+void DotGraphPrinter::print_edges(const Graph &g, std::ostream &os)
+{
+    for(const auto &e : g.edges())
+    {
+        if(e)
+        {
+            std::string source_node_id = std::string("n") + support::cpp11::to_string(e->producer_id());
+            std::string sink_node_id   = std::string("n") + support::cpp11::to_string(e->consumer_id());
+            os << source_node_id << " -> " << sink_node_id << " ";
+            const Tensor *t = e->tensor();
+            ARM_COMPUTE_ERROR_ON(t == nullptr);
+            os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" << t->desc().layout << R"("])";
+            os << ";\n";
+        }
+    }
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index 50b0f0e..7f0e374 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/Allocator.h"
+#include "arm_compute/runtime/MemoryRegion.h"
 
 #include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
@@ -39,3 +41,9 @@
 {
     ::operator delete(ptr);
 }
+
+std::unique_ptr<IMemoryRegion> Allocator::make_region(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(alignment);
+    return arm_compute::support::cpp14::make_unique<MemoryRegion>(size);
+}
\ No newline at end of file
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 3ca5071..2a4ab6e 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,15 +57,15 @@
     ARM_COMPUTE_ERROR_ON(!are_all_finalized());
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
 
-    // Sort active group requirements in descending order.
-    std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & a, const Element & b)
+    // Sort free blobs requirements in descending order.
+    _free_blobs.sort([](const Blob & ba, const Blob & bb)
     {
-        return a.size > b.size;
+        return ba.max_size > bb.max_size;
     });
     std::vector<size_t> group_sizes;
-    std::transform(std::begin(_active_elements), std::end(_active_elements), std::back_inserter(group_sizes), [](const Element & e)
+    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
     {
-        return e.size;
+        return b.max_size;
     });
 
     // Update blob sizes
@@ -80,8 +80,14 @@
     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
     int   blob_idx       = 0;
-    for(auto &e : _active_elements)
+    for(auto &free_blob : _free_blobs)
     {
-        group_mappings[e.handle] = blob_idx++;
+        for(auto &bound_element_id : free_blob.bound_elements)
+        {
+            ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
+            Element &bound_element               = _active_elements[bound_element_id];
+            group_mappings[bound_element.handle] = blob_idx;
+        }
+        ++blob_idx;
     }
 }
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index 9a5c13a..84789e7 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,9 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
 
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
@@ -47,3 +49,9 @@
     ARM_COMPUTE_ERROR_ON(ptr == nullptr);
     clReleaseMemObject(static_cast<cl_mem>(ptr));
 }
+
+std::unique_ptr<IMemoryRegion> CLBufferAllocator::make_region(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(alignment);
+    return arm_compute::support::cpp14::make_unique<CLBufferMemoryRegion>(_context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+}
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
index 3f5266c..c4ea639 100644
--- a/src/runtime/CL/CLHOG.cpp
+++ b/src/runtime/CL/CLHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,11 +74,11 @@
 uint8_t *CLHOG::do_map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size()));
+    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info()->descriptor_size() * sizeof(float)));
 }
 
 void CLHOG::do_unmap(cl::CommandQueue &q)
 {
     ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
     q.enqueueUnmapMemObject(_buffer, descriptor());
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
new file mode 100644
index 0000000..534c4f9
--- /dev/null
+++ b/src/runtime/CL/CLMemory.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemory.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+CLMemory::CLMemory()
+    : _region(nullptr), _region_owned(nullptr)
+{
+    create_empty_region();
+}
+
+CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
+    : _region(nullptr), _region_owned(std::move(memory))
+{
+    if(_region_owned == nullptr)
+    {
+        create_empty_region();
+    }
+    _region = _region_owned.get();
+}
+
+CLMemory::CLMemory(ICLMemoryRegion *memory)
+    : _region(memory), _region_owned(nullptr)
+{
+    _region = memory;
+}
+
+ICLMemoryRegion *CLMemory::region()
+{
+    return _region;
+}
+
+ICLMemoryRegion *CLMemory::region() const
+{
+    return _region;
+}
+
+void CLMemory::create_empty_region()
+{
+    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context::getDefault(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+    _region       = _region_owned.get();
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
new file mode 100644
index 0000000..15fd7f3
--- /dev/null
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+ICLMemoryRegion::ICLMemoryRegion(cl::Context ctx, size_t size)
+    : IMemoryRegion(size), _ctx(std::move(ctx)), _mapping(nullptr), _mem()
+{
+}
+
+const cl::Buffer &ICLMemoryRegion::cl_data() const
+{
+    return _mem;
+}
+
+void *ICLMemoryRegion::buffer()
+{
+    return _mapping;
+}
+
+void *ICLMemoryRegion::buffer() const
+{
+    return _mapping;
+}
+
+void **ICLMemoryRegion::handle()
+{
+    return reinterpret_cast<void **>(&_mem);
+}
+
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
+    : ICLMemoryRegion(std::move(ctx), size)
+{
+    if(_size != 0)
+    {
+        _mem = cl::Buffer(_ctx, flags, _size);
+    }
+}
+
+void *CLBufferMemoryRegion::ptr()
+{
+    return nullptr;
+}
+
+void *CLBufferMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+    _mapping = q.enqueueMapBuffer(_mem, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, _size);
+    return _mapping;
+}
+
+void CLBufferMemoryRegion::unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_mem.get() == nullptr);
+    q.enqueueUnmapMemObject(_mem, _mapping);
+    _mapping = nullptr;
+}
+
+ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLMemoryRegion(std::move(ctx), size), _ptr(nullptr)
+{
+    if(size != 0)
+    {
+        _ptr = clSVMAlloc(_ctx.get(), flags, size, alignment);
+        if(_ptr != nullptr)
+        {
+            _mem = cl::Buffer(_ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
+        }
+    }
+}
+
+ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
+{
+    if(_ptr != nullptr)
+    {
+        clFinish(CLScheduler::get().queue().get());
+        _mem = cl::Buffer();
+        clSVMFree(_ctx.get(), _ptr);
+    }
+}
+
+void *ICLSVMMemoryRegion::ptr()
+{
+    return _ptr;
+}
+
+CLCoarseSVMMemoryRegion::CLCoarseSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+    _mapping = _ptr;
+    return _mapping;
+}
+
+void CLCoarseSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
+    clEnqueueSVMUnmap(q.get(), _ptr, 0, nullptr, nullptr);
+    _mapping = nullptr;
+}
+
+CLFineSVMMemoryRegion::CLFineSVMMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size, size_t alignment)
+    : ICLSVMMemoryRegion(std::move(ctx), flags, size, alignment)
+{
+}
+
+void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
+{
+    if(blocking)
+    {
+        clFinish(q.get());
+    }
+    _mapping = _ptr;
+    return _mapping;
+}
+
+void CLFineSVMMemoryRegion::unmap(cl::CommandQueue &q)
+{
+    ARM_COMPUTE_UNUSED(q);
+    _mapping = nullptr;
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 65292fe..fdae615 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -31,7 +31,7 @@
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
+    : _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
 {
 }
 
@@ -52,7 +52,7 @@
     if(_cl_tuner != nullptr)
     {
         // Tune the OpenCL kernel
-        _cl_tuner->tune_kernel(kernel);
+        _cl_tuner->tune_kernel_dynamic(kernel);
     }
 
     // Run kernel
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 5f58024..d0e7d76 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -29,6 +29,11 @@
 
 using namespace arm_compute;
 
+CLSubTensor::CLSubTensor()
+    : _parent(nullptr), _info()
+{
+}
+
 CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
     : _parent(nullptr), _info()
 {
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index bc513d1..dd27738 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
     return _allocator.cl_data();
 }
 
-ITensorAllocator *CLTensor::allocator()
+CLTensorAllocator *CLTensor::allocator()
 {
     return &_allocator;
 }
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index ad165fa..54e7c5b 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,36 +30,57 @@
 
 using namespace arm_compute;
 
-CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
-    : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner)
+namespace
 {
-}
+std::shared_ptr<arm_compute::ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+{
+    // Try fine-grain SVM
+    std::shared_ptr<ICLMemoryRegion> region = std::make_shared<CLFineSVMMemoryRegion>(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
 
-CLTensorAllocator::~CLTensorAllocator()
+    // Try coarse-grain SVM in case of failure
+    if(region != nullptr && region->ptr() == nullptr)
+    {
+        region = std::make_shared<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+    }
+    // Try legacy buffer memory in case of failure
+    if(region != nullptr && region->ptr() == nullptr)
+    {
+        region = std::make_shared<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+    }
+    return region;
+}
+} // namespace
+
+CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
+    : _associated_memory_group(nullptr), _memory(), _owner(owner)
 {
-    _buffer = cl::Buffer();
 }
 
 uint8_t *CLTensorAllocator::data()
 {
-    return _mapping;
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 const cl::Buffer &CLTensorAllocator::cl_data() const
 {
-    return _buffer;
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    return _memory.region()->cl_data();
 }
 
 void CLTensorAllocator::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+
     if(_associated_memory_group == nullptr)
     {
-        _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size());
+        ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+        _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer()), info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory.region()->handle(), info().total_size());
+        _memory.region()->set_size(info().total_size());
     }
     info().set_is_resizable(false);
 }
@@ -68,41 +89,55 @@
 {
     if(_associated_memory_group == nullptr)
     {
-        _buffer = cl::Buffer();
+        _memory = CLMemory();
         info().set_is_resizable(true);
     }
 }
 
+arm_compute::Status CLTensorAllocator::import_memory(CLMemory memory)
+{
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->cl_data().get() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+    _memory = memory;
+    info().set_is_resizable(false);
+
+    return Status{};
+}
+
 void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
 {
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
+    _memory                  = CLMemory(std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0));
     _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *CLTensorAllocator::lock()
 {
-    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
-    _mapping = map(CLScheduler::get().queue(), true);
-    return _mapping;
+    return map(CLScheduler::get().queue(), true);
 }
 
 void CLTensorAllocator::unlock()
 {
-    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
-    unmap(CLScheduler::get().queue(), _mapping);
-    _mapping = nullptr;
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    unmap(CLScheduler::get().queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
 }
 
 uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size()));
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+    _memory.region()->map(q, blocking);
+    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
 {
-    ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr);
-    q.enqueueUnmapMemObject(_buffer, mapping);
+    ARM_COMPUTE_UNUSED(mapping);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() == nullptr);
+    _memory.region()->unmap(q);
 }
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index df8e255..5f82cd3 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -35,61 +35,6 @@
 
 using namespace arm_compute;
 
-namespace
-{
-/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
-class Interceptor
-{
-public:
-    explicit Interceptor(CLTuner &tuner);
-
-    /** clEnqueueNDRangeKernel interface
-     *
-     * @param[in] command_queue           A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
-     * @param[in] kernel                  A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
-     * @param[in] work_dim                The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
-     * @param[in] gwo                     Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
-     * @param[in] gws                     Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
-     * @param[in] lws                     Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
-     * @param[in] num_events_in_wait_list Number of events in the waiting list
-     * @param[in] event_wait_list         Event waiting list
-     * @param[in] event                   OpenCL kernel event
-     *
-     * @return the OpenCL status
-     */
-    cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
-                      const cl_event *event_wait_list, cl_event *event);
-
-private:
-    CLTuner &_tuner;
-};
-
-Interceptor::Interceptor(CLTuner &tuner)
-    : _tuner(tuner)
-{
-}
-
-cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
-                               const cl_event *event_wait_list, cl_event *event)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
-    ARM_COMPUTE_UNUSED(event);
-    if(_tuner.kernel_event_is_set())
-    {
-        // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
-        return CL_SUCCESS;
-    }
-    cl_event tmp;
-    cl_int   retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
-
-    // Set OpenCL event
-    _tuner.set_cl_kernel_event(tmp);
-
-    return retval;
-}
-
-} // namespace
-
 CLTuner::CLTuner(bool tune_new_kernels)
     : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
 {
@@ -113,7 +58,12 @@
     return _tune_new_kernels;
 }
 
-void CLTuner::tune_kernel(ICLKernel &kernel)
+void CLTuner::tune_kernel_static(ICLKernel &kernel)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+}
+
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
 {
     // Get the configuration ID from the kernel
     const std::string &config_id = kernel.config_id();
@@ -173,7 +123,25 @@
         }
     }
     // Start intercepting enqueues:
-    CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
+    auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                              const cl_event * event_wait_list, cl_event * event)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+        ARM_COMPUTE_UNUSED(event);
+        if(this->kernel_event_is_set())
+        {
+            // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
+            return CL_SUCCESS;
+        }
+        cl_event tmp;
+        cl_int   retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+        // Set OpenCL event
+        this->set_cl_kernel_event(tmp);
+
+        return retval;
+    };
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
     cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
new file mode 100644
index 0000000..ff50073
--- /dev/null
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLChannelShuffleLayer::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLChannelShuffleLayerKernel>();
+    k->configure(input, output, num_groups);
+    _kernel = std::move(k);
+}
+
+Status CLChannelShuffleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    return CLChannelShuffleLayerKernel::validate(input, output, num_groups);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..c226e56
--- /dev/null
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
+                                               DataLayout data_layout)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLConvertFullyConnectedWeightsKernel>();
+    k->configure(input, output, original_input_shape, data_layout);
+    _kernel = std::move(k);
+}
+
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+                                                DataLayout data_layout)
+{
+    return CLConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 1a486ce..47a8d5f 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -42,25 +42,34 @@
 {
 }
 
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+                                                            enable_fast_math));
 
-    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
-                                                      weights_info, CLScheduler::get().target()))
+    switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+                                                      weights_info, act_info, CLScheduler::get().target(), dilation, enable_fast_math))
     {
+        case ConvolutionMethod::WINOGRAD:
+        {
+            auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+            _function = std::move(f);
+            break;
+        }
         case ConvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
-            f->configure(input, weights, biases, output, conv_info);
+            f->configure(input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<CLGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info);
+            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
             _function = std::move(f);
             break;
         }
@@ -71,25 +80,30 @@
 }
 
 Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
 
-    //Configure if the parameters match the direct convolution or the gemm-based
     const GPUTarget gpu_target = CLScheduler::get().target();
 
-    switch(CLConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, gpu_target))
+    switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation, enable_fast_math))
     {
+        case ConvolutionMethod::WINOGRAD:
+        {
+            //Validate Winograd
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+            break;
+        }
         case ConvolutionMethod::DIRECT:
         {
             // Validate direct convolution layer
-            CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             // Validate gemm-based convolution layer
-            CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
             break;
         }
         default:
@@ -100,21 +114,34 @@
     return Status{};
 }
 
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const GPUTarget gpu_target)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                             const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
 {
-    ARM_COMPUTE_UNUSED(input);
-    ARM_COMPUTE_UNUSED(weights);
-    ARM_COMPUTE_UNUSED(biases);
-    ARM_COMPUTE_UNUSED(output);
-    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
     ARM_COMPUTE_UNUSED(weights_info);
     ARM_COMPUTE_UNUSED(gpu_target);
 
-    return ConvolutionMethod::GEMM;
+    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+    if(dilation != Size2D(1U, 1U) || (input->dimension(idx_c) < 16))
+    {
+        return ConvolutionMethod::GEMM;
+    }
+    else
+    {
+        return bool(CLWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+    }
 }
 
 void CLConvolutionLayer::run()
 {
+    prepare();
     _function->run();
 }
+
+void CLConvolutionLayer::prepare()
+{
+    _function->prepare();
+}
diff --git a/src/graph/CL/CLMap.cpp b/src/runtime/CL/functions/CLCopy.cpp
similarity index 68%
rename from src/graph/CL/CLMap.cpp
rename to src/runtime/CL/functions/CLCopy.cpp
index 5289ea9..3442e37 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,23 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLMap.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute::graph;
+#include <utility>
 
-CLMap::CLMap(ITensorObject *tensor, bool blocking)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
+using namespace arm_compute;
+
+void CLCopy::configure(ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLMap::run()
-{
-    _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
+    auto k = arm_compute::support::cpp14::make_unique<CLCopyKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 9e6c0b4..cb8dc02 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -80,7 +80,7 @@
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, info, WeightsInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
 
     return Status{};
 }
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 88e9376..676a121 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -24,6 +24,8 @@
 #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -35,17 +37,27 @@
 using namespace arm_compute::misc::shape_calculator;
 
 CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
-    : _kernel(), _border_handler()
+    : _kernel(nullptr), _border_handler()
 {
 }
 
-void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
+                                               ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    _kernel.set_target(CLScheduler::get().target());
-    _kernel.configure(input, weights, biases, output, conv_info);
+    if(input->info()->data_layout() == DataLayout::NCHW)
+    {
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
+    }
+    else
+    {
+        _kernel = arm_compute::support::cpp14::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
+    }
+
+    _kernel->set_target(CLScheduler::get().target());
+    _kernel->configure(input, weights, biases, output, conv_info, depth_multiplier, act_info);
 
     // Configure border handler
     PixelValue &&zero_value(0.f);
@@ -53,42 +65,62 @@
     {
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, zero_value);
+}
+
+Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int        depth_multiplier,
+                                                ActivationLayerInfo act_info, GPUTarget gpu_target)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    if(input->data_layout() == DataLayout::NCHW)
+    {
+        return CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target);
+    }
+
+    return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info);
 }
 
 void CLDepthwiseConvolutionLayer3x3::run()
 {
     CLScheduler::get().enqueue(_border_handler);
-    CLScheduler::get().enqueue(_kernel);
+    CLScheduler::get().enqueue(*_kernel);
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
 
     const size_t weights_w = weights->info()->dimension(0);
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     bool            append_bias = (biases != nullptr) && !_is_quantized;
     const GPUTarget gpu_target  = CLScheduler::get().target();
 
     // Calculate output shape
-    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     // Output width and height
-    const unsigned int conv_w = dwc_output_shape.x();
-    const unsigned int conv_h = dwc_output_shape.y();
+    const unsigned int conv_w = output_shape.x();
+    const unsigned int conv_h = output_shape.y();
 
     // Set up intermediate tensors
     const size_t patch_size = weights_w * weights_h + ((append_bias) ? 1 : 0);
@@ -101,7 +133,7 @@
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
     _im2col_kernel.set_target(gpu_target);
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -117,7 +149,7 @@
     _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.set_target(gpu_target);
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
     _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
     // Output staged configuration
@@ -152,18 +184,72 @@
     _v2mm_output.allocator()->allocate();
 }
 
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != weights->dimension(2));
+
+    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool         append_bias  = (biases != nullptr) && !is_quantized;
+    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const size_t       weights_w    = weights->dimension(0);
+    const size_t       weights_h    = weights->dimension(1);
+    const size_t       weights_z    = weights->dimension(2);
+    const unsigned int conv_w       = output_shape.x();
+    const unsigned int conv_h       = output_shape.y();
+    const size_t       patch_size   = weights_w * weights_h + ((append_bias) ? 1 : 0);
+    const size_t       conv_size    = conv_w * conv_h;
+
+    TensorShape shape_im2col = input->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+    TensorShape shape_v2mm_out = input->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+    }
+
+    return Status{};
+}
+
 void CLDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        CLScheduler::get().enqueue(_weights_reshape_kernel);
+        CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     CLScheduler::get().enqueue(_im2col_kernel);
-
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
-
     CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
     CLScheduler::get().enqueue(_v2mm_kernel);
-
     CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-
     if(_is_quantized)
     {
         CLScheduler::get().enqueue(_output_stage_kernel);
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 5559d42..6f33b2e 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
@@ -33,8 +34,18 @@
 {
 }
 
+Status CLDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayerKernel::validate(input, output, min_max));
+
+    return Status{};
+}
+
 void CLDequantizationLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *min_max)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
     _dequantize_kernel.configure(input, output, min_max);
 }
 
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index d6a335c..c451bd4 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,11 +33,11 @@
 using namespace arm_compute;
 
 CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _direct_conv_kernel(), _input_border_handler()
+    : _direct_conv_kernel(), _input_border_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
 {
 }
 
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
     // Set GPU target
     _direct_conv_kernel.set_target(CLScheduler::get().target());
@@ -52,11 +52,28 @@
         zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
     }
     _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(_direct_conv_kernel);
+
+    _is_activationlayer_enabled = act_info.enabled();
+
+    //Configure Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                          const ActivationLayerInfo &act_info)
 {
-    return CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target());
+    ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target()));
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+    return Status{};
 }
 
 void CLDirectConvolutionLayer::run()
@@ -66,4 +83,10 @@
 
     // Run direct convolution
     CLScheduler::get().enqueue(_direct_conv_kernel);
+
+    //Run Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 2b4670b..151fa1b 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -37,10 +37,8 @@
 
 namespace
 {
-Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
 {
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
     if(is_data_type_quantized_asymmetric(input.data_type()))
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -55,7 +53,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
     }
 
     return Status{};
@@ -75,12 +73,12 @@
 }
 
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
-      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
+    : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+      _im2col_output(), _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
     if(_is_quantized)
     {
@@ -102,8 +100,7 @@
     else
     {
         // Configure matrix multiply kernel
-        _mm_kernel.set_target(CLScheduler::get().target());
-        _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
     }
 }
 
@@ -114,7 +111,7 @@
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
-    TensorShape shape_im2col = compute_im2col_shape(input->info());
+    TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
     _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
 
     // Configure im2col kernel
@@ -122,7 +119,7 @@
     _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
 
     // Configure matrix multiply kernel
-    configure_mm(&_im2col_output, weights, output, false);
+    configure_mm(&_im2col_output, weights, output);
 
     // Allocate the output tensor for im2col once all the configure methods have been called
     _im2col_output.allocator()->allocate();
@@ -133,7 +130,7 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    configure_mm(input, weights, output, false);
+    configure_mm(input, weights, output);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
@@ -152,6 +149,7 @@
     _is_fc_after_conv     = true;
     _accumulate_biases    = false;
     _is_quantized         = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _original_weights     = weights;
 
     // Configure gemmlowp output
     if(_is_quantized)
@@ -222,13 +220,6 @@
         _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
         _gemmlowp_output.allocator()->allocate();
     }
-
-    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
-    {
-        // Allocate the tensor for the weights reshaped
-        _reshape_weights_output.allocator()->allocate();
-    }
 }
 
 Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
@@ -243,7 +234,7 @@
     bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
     const GPUTarget gpu_target       = CLScheduler::get().target();
 
-    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input)));
+    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
     const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
 
@@ -300,7 +291,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
     }
     // Validate matrix multiply kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output));
 
     // Validate output stage for asymmetric quantized types
     if(is_quantized)
@@ -313,12 +304,7 @@
 
 void CLFullyConnectedLayer::run()
 {
-    // Reshape of the weights (happens only once)
-    if(!_are_weights_reshaped)
-    {
-        _are_weights_reshaped = true;
-        _reshape_weights_kernel.run();
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -335,7 +321,7 @@
     }
     else
     {
-        CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+        _mm_gemm.run();
     }
 
     // Accumulate biases if provided
@@ -353,3 +339,30 @@
 
     _memory_group.release();
 }
+
+void CLFullyConnectedLayer::prepare()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run reshape weights kernel and mark weights as unused
+        _reshape_weights_output.allocator()->allocate();
+        _reshape_weights_kernel.run();
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM prepare and release unused weights
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_reshape_weights_output.is_used())
+            {
+                _reshape_weights_output.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _are_weights_reshaped = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 6b5cd2d..f81da6c 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -29,14 +29,18 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
@@ -44,9 +48,10 @@
 {
     bool flag = true;
 
-    if(gpu_target == GPUTarget::BIFROST)
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
-        if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+        // COMPMID-852
+        if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
         {
             const float scale = k < 1024 ? 2.0f : 2.5f;
             flag              = (scale * n) > ((1.66f * n) + 38.4f);
@@ -56,39 +61,19 @@
             flag = false;
         }
     }
-
-    return flag;
-}
-
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
-    if(c != nullptr)
+    else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+        // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
+        flag = m != 1 && reshape_b_only_on_first_run;
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
-    ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_UNUSED(beta);
-    return Status{};
+    return flag;
 }
 } // namespace
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
-      _is_first_run(true), _reshape_b_only_on_first_run(false)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
 
@@ -97,10 +82,14 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+    // Store original b matrix
+    _original_b = b;
 
     // Check if we need to reshape the matrix B only on the first run
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = false;
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -121,7 +110,7 @@
     int       mult_transpose1xW_width   = 1;
     int       mult_interleave4x4_height = 1;
 
-    if(gpu_target == GPUTarget::BIFROST)
+    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
@@ -137,8 +126,10 @@
 
         // Manage intermediate buffers
         _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
-
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_tmp_b);
+        }
         // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
         // Configure interleave kernel
@@ -154,7 +145,10 @@
     {
         // Allocate intermediate tensors
         _tmp_a.allocator()->allocate();
-        _tmp_b.allocator()->allocate();
+        if(!_reshape_b_only_on_first_run)
+        {
+            _tmp_b.allocator()->allocate();
+        }
     }
 
     // Configure matrix addition kernel
@@ -165,14 +159,74 @@
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ICLTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+    ARM_COMPUTE_UNUSED(alpha);
+
+    // Check if we need to reshape the matrix B only on the first run
+    const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+
+    const ITensorInfo *matrix_a_info = a;
+    const ITensorInfo *matrix_b_info = b;
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+    TensorInfo tmp_output_info = *output->clone();
+
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
+    // in order to know how the matrices have been reshaped
+    const int m                         = a->dimension(1);
+    const int n                         = b->dimension(0);
+    const int k                         = a->dimension(0);
+    int       mult_transpose1xW_width   = 1;
+    int       mult_interleave4x4_height = 1;
+
+    if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
+    {
+        mult_transpose1xW_width   = 4;
+        mult_interleave4x4_height = 2;
+    }
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height);
+
+    // Check if we need to reshape the matrix A and matrix B
+    const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+
+    if(run_interleave_transpose)
+    {
+        matrix_a_info = &tmp_a_info;
+        matrix_b_info = &tmp_b_info;
+
+        // Validate interleave kernel
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height));
+
+        // Validate transpose kernel
+        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
+    }
+
+    // Validate matrix multiply
+    auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info, gpu_target));
+
+    if(beta != 0 && c != nullptr)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, &tmp_output_info, beta));
+    }
+
     return Status{};
 }
 
 void CLGEMM::run()
 {
+    prepare();
+
     _memory_group.acquire();
 
     if(_is_interleaved_transposed)
@@ -180,14 +234,7 @@
         // Run interleave kernel
         CLScheduler::get().enqueue(_interleave_kernel, false);
 
-        if(_is_first_run)
-        {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_transpose_kernel, false);
-
-            _is_first_run = false;
-        }
-        else if(!_reshape_b_only_on_first_run)
+        if(!_reshape_b_only_on_first_run)
         {
             // Run transpose kernel
             CLScheduler::get().enqueue(_transpose_kernel, false);
@@ -205,3 +252,19 @@
 
     _memory_group.release();
 }
+
+void CLGEMM::prepare()
+{
+    if(!_is_prepared)
+    {
+        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            _tmp_b.allocator()->allocate();
+            CLScheduler::get().enqueue(_transpose_kernel, false);
+            _original_b->mark_as_unused();
+        }
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c58af36..79495e4 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -38,8 +38,8 @@
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
-CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped()
+CLConvolutionLayerReshapeWeights::CLConvolutionLayerReshapeWeights()
+    : _weights_reshape_kernel()
 {
 }
 
@@ -86,16 +86,12 @@
 
 void CLConvolutionLayerReshapeWeights::run()
 {
-    _memory_group.acquire();
-
     CLScheduler::get().enqueue(_weights_reshape_kernel);
-
-    _memory_group.release();
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _im2col_output(),
-      _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_first_run(true)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
@@ -155,7 +151,8 @@
     return Status{};
 }
 
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -164,9 +161,13 @@
                                                                 biases != nullptr ? biases->info() : nullptr,
                                                                 output->info(),
                                                                 conv_info,
-                                                                weights_info));
+                                                                weights_info,
+                                                                dilation,
+                                                                act_info));
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_prepared      = false;
+    _original_weights = weights;
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const DataType dt = input->info()->data_type();
 
@@ -191,7 +192,7 @@
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
+                                                 conv_info, dilation);
 
     unsigned int mat_weights_cols = weights->info()->dimension(3);
     unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
@@ -226,7 +227,7 @@
     _memory_group.manage(&_gemm_output);
 
     // Configure im2col
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
 
     // Configure GEMM
     configure_mm(&_im2col_output, weights, &_gemm_output);
@@ -255,14 +256,19 @@
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
-    // Allocate intermediate tensor
-    _weights_reshaped.allocator()->allocate();
+    //Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 
     ARM_COMPUTE_UNUSED(weights_info);
 }
 
 Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
@@ -272,6 +278,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(act_info.b() > act_info.a());
+    }
+
     const bool     is_quantized = is_data_type_quantized_asymmetric(input->data_type());
     const bool     append_bias  = (biases != nullptr) && (!is_quantized);
     const unsigned bias_element = (append_bias) ? 1 : 0;
@@ -284,12 +295,12 @@
     const unsigned int kernel_width  = weights->dimension(0);
     const unsigned int kernel_height = weights->dimension(1);
 
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height, conv_info, dilation);
 
     unsigned int mat_weights_cols = weights->dimension(3);
     unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + bias_element;
 
-    CLConvolutionLayerReshapeWeights::validate(weights, biases, nullptr);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
 
     // Create tensor info for im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -300,7 +311,7 @@
     shape_im2col.set(2, 1);
     TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->fixed_point_position());
     im2col_reshaped_info.set_quantization_info(input->quantization_info());
-    CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
 
     // Create GEMM output tensor
     TensorShape shape_gemm = im2col_reshaped_info.tensor_shape();
@@ -311,9 +322,10 @@
     TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->fixed_point_position());
     info_gemm.set_quantization_info(output->quantization_info());
 
-    validate_mm(&im2col_reshaped_info, weights, &info_gemm);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(&im2col_reshaped_info, weights, &info_gemm));
+    TensorInfo tmp_info(shape_gemm, 1, DataType::QASYMM8, input->fixed_point_position());
+    tmp_info.set_quantization_info(output->quantization_info());
 
-    TensorInfo tmp_info(input->tensor_shape(), 1, DataType::QASYMM8, input->fixed_point_position());
     if(is_quantized)
     {
         float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output->quantization_info().scale;
@@ -324,7 +336,7 @@
     }
 
     // Validate Col2Im
-    CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? &tmp_info : &info_gemm, output, std::make_pair(conv_w, conv_h)));
 
     if(biases != nullptr)
     {
@@ -341,18 +353,18 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
+    //Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+
     return Status{};
 }
 
 void CLGEMMConvolutionLayer::run()
 {
-    // Run weights reshaping (Runs once for every configure)
-    if(_is_first_run)
-    {
-        _reshape_weights.run();
-
-        _is_first_run = false;
-    }
+    prepare();
 
     _memory_group.acquire();
 
@@ -377,5 +389,36 @@
     // Reshape output matrix
     CLScheduler::get().enqueue(_col2im_kernel, false);
 
+    //Run Activation Layer if enabled
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+
     _memory_group.release();
 }
+
+void CLGEMMConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run weights reshaping and mark as unused
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+        _weights_reshaped.allocator()->allocate();
+        _reshape_weights.run();
+        _original_weights->mark_as_unused();
+
+        // Run GEMM prepare
+        if(!_is_quantized)
+        {
+            _mm_gemm.prepare();
+            if(!_weights_reshaped.is_used())
+            {
+                _weights_reshaped.allocator()->free();
+            }
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index c688299..711b006 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -41,7 +41,7 @@
 {
     bool flag = true;
 
-    if(gpu_target == GPUTarget::BIFROST)
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX))
     {
         // COMPMID-852
         if(k > 256 && m > 4 && reshape_b_only_on_first_run)
@@ -102,7 +102,10 @@
         matrix_b = &_tmp_b;
 
         _memory_group.manage(&_tmp_a);
-        _memory_group.manage(&_tmp_b);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_tmp_b);
+        }
 
         // Configure interleave kernel
         _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
@@ -119,7 +122,10 @@
     {
         TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
         _vector_sum_col.allocator()->init(info_vector_sum_col);
-        _memory_group.manage(&_vector_sum_col);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_vector_sum_col);
+        }
 
         // Configure Matrix B reduction kernel
         _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 4b32954..ddce5fb 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,8 @@
 }
 
 CLGaussianPyramidHalf::CLGaussianPyramidHalf() // NOLINT
-    : _border_handler(),
+    : _horizontal_border_handler(),
+      _vertical_border_handler(),
       _horizontal_reduction(),
       _vertical_reduction()
 {
@@ -64,6 +65,9 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
     ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
 
+    // Constant value to use for vertical fill border when the border mode is CONSTANT
+    const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
+
     /* Get number of pyramid levels */
     const size_t num_levels = pyramid->info()->num_levels();
 
@@ -72,28 +76,31 @@
 
     if(num_levels > 1)
     {
-        _border_handler       = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction   = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
+        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(num_levels - 1);
+        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<CLGaussianPyramidHorKernel[]>(num_levels - 1);
+        _vertical_reduction        = arm_compute::support::cpp14::make_unique<CLGaussianPyramidVertKernel[]>(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
         tensor_shape.set(0, (pyramid->info()->width() + 1) * SCALE_PYRAMID_HALF);
 
         PyramidInfo pyramid_info(num_levels - 1, SCALE_PYRAMID_HALF, tensor_shape, Format::U16);
-
         _tmp.init(pyramid_info);
 
         for(size_t i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+            /* Configure border */
+            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
         }
         _tmp.allocate();
     }
@@ -110,13 +117,15 @@
     _pyramid->get_pyramid_level(0)->map(CLScheduler::get().queue(), true /* blocking */);
     _input->map(CLScheduler::get().queue(), true /* blocking */);
     _pyramid->get_pyramid_level(0)->copy_from(*_input);
+
     _input->unmap(CLScheduler::get().queue());
     _pyramid->get_pyramid_level(0)->unmap(CLScheduler::get().queue());
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        CLScheduler::get().enqueue(_border_handler[i], false);
+        CLScheduler::get().enqueue(_horizontal_border_handler[i], false);
         CLScheduler::get().enqueue(_horizontal_reduction[i], false);
+        CLScheduler::get().enqueue(_vertical_border_handler[i], false);
         CLScheduler::get().enqueue(_vertical_reduction[i], false);
     }
 }
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index d1bb65f..a3010a7 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,6 +52,26 @@
     _sumsq.allocator()->allocate();
 }
 
+Status CLL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    TensorShape shape(input->tensor_shape());
+
+    // Create intermediate tensor info
+    TensorInfo sum_sq;
+    sum_sq.set_data_type(input->data_type());
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+    // Reduce shape on axis (supported axis is 0)
+    shape.set(0, 1);
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+    return Status{};
+}
+
 void CLL2NormalizeLayer::run()
 {
     _memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
new file mode 100644
index 0000000..930d311
--- /dev/null
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -0,0 +1,508 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate1(), _gemm_input_gate2(), _transpose_input_gate1(), _transpose_input_gate2(), _accum_input_gate1(),
+      _accum_input_gate2(), _subtract_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate1(), _gemm_forget_gate2(), _transpose_forget_gate1(),
+      _transpose_forget_gate2(), _accum_forget_gate1(), _accum_forget_gate2(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state1(),
+      _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output1(),
+      _gemm_output2(), _transpose_output1(), _transpose_output2(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state(),
+      _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(),
+      _input_gate_out3(), _input_gate_out4(), _input_gate_out5(), _input_gate_out6(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
+      _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _output6(),
+      _cell_state_activation(), _output_projection1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false),
+      _perform_projection_clipping(false)
+{
+}
+
+void CLLSTMLayer::configure(const ICLTensor *input, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
+                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
+                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
+                            ICLTensor *output_state, ICLTensor *cell_state, ICLTensor *scratch_buffer, ICLTensor *output, const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info,
+                            float cell_threshold, float projection_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                 forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    LSTMParams<ITensorInfo> lstm_params_info;
+    if(lstm_params.has_peephole_opt())
+    {
+        lstm_params_info.set_peephole_params(lstm_params.cell_to_input_weights()->info(), lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+    }
+    if(lstm_params.has_projection())
+    {
+        lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(), lstm_params.projection_bias()->info());
+    }
+    if(!lstm_params.has_cifg_opt())
+    {
+        lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
+                                         lstm_params.cell_to_input_weights()->info(), lstm_params.input_gate_bias()->info());
+    }
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
+                                                     input_to_cell_weights->info(), input_to_output_weights->info(),
+                                                     recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+                                                     forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+                                                     output_state->info(), cell_state->info(), scratch_buffer->info(), output->info(), lstm_params_info,
+                                                     activation_info, cell_threshold, projection_threshold));
+
+    const TensorShape cell_state_shape = cell_state->info()->tensor_shape();
+
+    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    TensorShape forget_gate2_shape = compute_transposed_shape(*forget_gate_bias->info());
+    TensorShape forget_gate3_shape{ 1, output_state->info()->dimension(1) };
+    _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
+    _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the forget gate
+    // forget_gate = Activation(input * input_to_forget_weights + output_state * recurrent_to_forget_weights + cell_state * cell_to_forget_weights + forget_gate_bias)
+    _memory_group.manage(&_forget_gate_out1);
+    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1, true, false);
+    _memory_group.manage(&_forget_gate_out2);
+    _transpose_forget_gate1.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+    _memory_group.manage(&_forget_gate_out3);
+    _gemm_forget_gate1.configure(output_state, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+    _forget_gate_out2.allocator()->allocate();
+    _memory_group.manage(&_forget_gate_out6);
+    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out6, ConvertPolicy::SATURATE);
+    CLTensor *forget_gate_out = &_forget_gate_out6;
+
+    if(lstm_params.has_peephole_opt())
+    {
+        _forget_gate_out4.allocator()->init(TensorInfo(forget_gate2_shape, 1, input->info()->data_type()));
+        _forget_gate_out5.allocator()->init(TensorInfo(forget_gate3_shape, 1, input->info()->data_type()));
+
+        _run_peephole_opt = true;
+        _memory_group.manage(&_forget_gate_out4);
+        _transpose_forget_gate2.configure(lstm_params.cell_to_forget_weights(), &_forget_gate_out4);
+        _memory_group.manage(&_forget_gate_out5);
+        _gemm_forget_gate2.configure(cell_state, &_forget_gate_out4, nullptr, &_forget_gate_out5, 1.f, 0.f);
+        _forget_gate_out4.allocator()->allocate();
+        _accum_forget_gate2.configure(&_forget_gate_out6, &_forget_gate_out5, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _forget_gate_out5.allocator()->allocate();
+        _forget_gate_out6.allocator()->allocate();
+        forget_gate_out = &_forget_gate_out3;
+    }
+    else
+    {
+        _forget_gate_out3.allocator()->allocate();
+    }
+    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    forget_gate_out->allocator()->allocate();
+
+    TensorShape input_gate3_shape{ 1, output_state->info()->dimension(1) };
+    _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _input_gate_out5.allocator()->init(TensorInfo(input_gate3_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the input gate
+    // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + cell_state * cell_to_input_weights + input_gate_bias), without CIFG
+    // input_gate = 1 - forget_gate, with CIFG
+    if(lstm_params.has_cifg_opt())
+    {
+        _memory_group.manage(&_input_gate_out1);
+        _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _ones.allocator()->allocate();
+        _run_cifg_opt = true;
+    }
+    else
+    {
+        TensorShape input_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+        TensorShape input_gate2_shape = compute_transposed_shape(*lstm_params.cell_to_input_weights()->info());
+
+        _input_gate_out2.allocator()->init(TensorInfo(input_gate1_shape, 1, input->info()->data_type()));
+        _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _input_gate_out4.allocator()->init(TensorInfo(input_gate2_shape, 1, input->info()->data_type()));
+        _input_gate_out6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        _memory_group.manage(&_input_gate_out1);
+        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1, true, false);
+        _memory_group.manage(&_input_gate_out2);
+        _transpose_input_gate1.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+        _memory_group.manage(&_input_gate_out3);
+        _gemm_input_gate1.configure(output_state, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+        _input_gate_out2.allocator()->allocate();
+        _memory_group.manage(&_input_gate_out4);
+        _transpose_input_gate2.configure(lstm_params.cell_to_input_weights(), &_input_gate_out4);
+        _memory_group.manage(&_input_gate_out5);
+        _gemm_input_gate2.configure(cell_state, &_input_gate_out4, nullptr, &_input_gate_out5, 1.f, 0.f);
+        _input_gate_out4.allocator()->allocate();
+        _memory_group.manage(&_input_gate_out6);
+        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out6, ConvertPolicy::SATURATE);
+        _input_gate_out3.allocator()->allocate();
+        _accum_input_gate2.configure(&_input_gate_out6, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _input_gate_out5.allocator()->allocate();
+        _input_gate_out6.allocator()->allocate();
+        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    }
+
+    TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
+    _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the cell state
+    // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
+    _memory_group.manage(&_cell_state_out1);
+    _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1, true, false);
+    _memory_group.manage(&_cell_state_out2);
+    _transpose_cell_state1.configure(recurrent_to_cell_weights, &_cell_state_out2);
+    _memory_group.manage(&_cell_state_out3);
+    _gemm_cell_state1.configure(output_state, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+    _cell_state_out2.allocator()->allocate();
+    _memory_group.manage(&_cell_state_out4);
+    _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+    _memory_group.manage(&_cell_state_out5);
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _input_gate_out1.allocator()->allocate();
+    _cell_state_out4.allocator()->allocate();
+    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _forget_gate_out1.allocator()->allocate();
+    _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _cell_state_out3.allocator()->allocate();
+    _cell_state_out5.allocator()->allocate();
+
+    // Perform clipping
+    if(cell_threshold != 0.f)
+    {
+        _perform_cell_clipping = true;
+        _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+    }
+
+    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    TensorShape output2_shape = compute_transposed_shape(*cell_bias->info());
+    TensorShape output3_shape{ 1, output_state->info()->dimension(1) };
+    _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
+    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output6.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the output
+    // output_gate = Activation(input * input_to_output_weights + output_state * recurrent_to_output_weights + cell_state * cell_to_output_weights + output_gate_bias)
+    _memory_group.manage(&_output1);
+    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1, true, false);
+    _memory_group.manage(&_output2);
+    _transpose_output1.configure(recurrent_to_output_weights, &_output2);
+    _memory_group.manage(&_output3);
+    _gemm_output1.configure(output_state, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _output2.allocator()->allocate();
+    _memory_group.manage(&_output6);
+    _accum_output1.configure(&_output1, &_output3, &_output6, ConvertPolicy::SATURATE);
+    _output3.allocator()->allocate();
+    CLTensor *output_gate_out = &_output6;
+    if(lstm_params.has_peephole_opt())
+    {
+        _output4.allocator()->init(TensorInfo(output2_shape, 1, input->info()->data_type()));
+        _output5.allocator()->init(TensorInfo(output3_shape, 1, input->info()->data_type()));
+
+        _memory_group.manage(&_output4);
+        _transpose_output2.configure(lstm_params.cell_to_output_weights(), &_output4);
+        _memory_group.manage(&_output5);
+        _gemm_output2.configure(&_cell_state_out1, &_output4, nullptr, &_output5, 1.f, 0.f);
+        _accum_output2.configure(&_output6, &_output5, &_output1, ConvertPolicy::SATURATE);
+        _output6.allocator()->allocate();
+        output_gate_out = &_output1;
+
+        // Allocate intermediate buffers
+        _output4.allocator()->allocate();
+        _output5.allocator()->allocate();
+    }
+    else
+    {
+        _output1.allocator()->allocate();
+    }
+    _activation_output.configure(output_gate_out, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    output_gate_out->allocator()->allocate();
+
+    _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    // Configure block that calculates the output state
+    /** lstm_res = PixelwiseMul(output, Activation(cell_state))
+     *
+     *                      -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
+     *                     /
+     *  output_state =  --
+     *                     \
+     *                      -- lstm_res , otherwise
+     */
+    _memory_group.manage(&_cell_state_activation);
+    _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
+    _pixelwise_mul_output_state.configure(&_cell_state_activation, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _cell_state_activation.allocator()->allocate();
+
+    if(lstm_params.has_projection())
+    {
+        _has_projection_weights = true;
+        _output_projection1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _memory_group.manage(&_output_projection1);
+        _fully_connected_output_state.configure(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), &_output_projection1, true, false);
+        // Perform clipping
+        if(projection_threshold != 0.f)
+        {
+            _perform_projection_clipping = true;
+            _projection_clip.configure(&_output_projection1, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+        }
+
+        // Allocate intermediate buffer
+        _output_projection1.allocator()->allocate();
+    }
+
+    // Copy cell state and output
+    _copy_cell_state.configure(&_cell_state_out1, cell_state);
+    _cell_state_out1.allocator()->allocate();
+    _copy_output.configure(output_state, output);
+
+    // Vector for holding the tensors to store in scratch buffer
+    std::vector<ICLTensor *> scratch_inputs;
+    if(lstm_params.has_cifg_opt())
+    {
+        scratch_inputs.emplace_back(&_input_gate_out1);
+    }
+    scratch_inputs.emplace_back(&_cell_state_out1);
+    scratch_inputs.emplace_back(forget_gate_out);
+    scratch_inputs.emplace_back(output_gate_out);
+    _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+}
+
+Status CLLSTMLayer::validate(const ITensorInfo *input, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+                             const ITensorInfo *output_state, const ITensorInfo *cell_state, const ITensorInfo *scratch_buffer, const ITensorInfo *output,
+                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                        forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                       recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state, cell_state);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_state->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights(), lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() != 1);
+    }
+
+    TensorShape      units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
+    TensorShape      gemmv_shape{ 1, output_state->dimension(1) };
+    TensorShape      num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
+    const TensorInfo units_out_transposed_info  = TensorInfo(units_out_transposed_shape, 1, input->data_type());
+    const TensorInfo gemmv_shape_info           = TensorInfo(gemmv_shape, 1, input->data_type());
+    const TensorInfo num_units_transposed_info  = TensorInfo(num_units_transposed_shape, 1, input->data_type());
+
+    // Validate forget gate
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, cell_state, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state, &units_out_transposed_info, nullptr, cell_state, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // Validate input gate
+    if(!lstm_params.has_cifg_opt())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.cell_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() != 1);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), cell_state, true, false));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(cell_state, &num_units_transposed_info, nullptr, &gemmv_shape_info, 1.f, 0.f, GEMMInfo()));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, &gemmv_shape_info, cell_state, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtractionKernel::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    }
+
+    // Validate cell state
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, cell_state, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, cell_state, cell_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+
+    if(cell_threshold != 0.f)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold)));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, cell_state, true, false));
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(cell_state, cell_state, cell_state, ConvertPolicy::SATURATE));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // Validate output state
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, cell_state, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state, output, output_state, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    if(lstm_params.has_projection())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(output_state, lstm_params.projection_weights(), lstm_params.projection_bias(), cell_state, true, false));
+        if(projection_threshold != 0.f)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(cell_state, output_state, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+                                                                                                                        projection_threshold)));
+        }
+    }
+
+    std::vector<TensorInfo> inputs_vector_info;
+    if(lstm_params.has_cifg_opt())
+    {
+        inputs_vector_info.emplace_back(*cell_state);
+    }
+    inputs_vector_info.emplace_back(*cell_state);
+    inputs_vector_info.emplace_back(*cell_state);
+    inputs_vector_info.emplace_back(*cell_state);
+
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    for(auto &input : inputs_vector_info)
+    {
+        inputs_vector_info_raw.emplace_back(&input);
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+    return Status{};
+}
+
+void CLLSTMLayer::run()
+{
+    _memory_group.acquire();
+
+    _fully_connected_forget_gate.run();
+    CLScheduler::get().enqueue(_transpose_forget_gate1);
+    _gemm_forget_gate1.run();
+    CLScheduler::get().enqueue(_accum_forget_gate1);
+
+    if(_run_peephole_opt)
+    {
+        CLScheduler::get().enqueue(_transpose_forget_gate2);
+        _gemm_forget_gate2.run();
+        _accum_forget_gate2.run();
+    }
+    CLScheduler::get().enqueue(_activation_forget_gate);
+
+    if(_run_cifg_opt)
+    {
+        _ones.map(true);
+        std::fill_n(_ones.buffer(), _ones.info()->total_size(), 1);
+        _ones.unmap();
+        CLScheduler::get().enqueue(_subtract_input_gate);
+    }
+    else
+    {
+        _fully_connected_input_gate.run();
+        CLScheduler::get().enqueue(_transpose_input_gate1);
+        _gemm_input_gate1.run();
+        CLScheduler::get().enqueue(_transpose_input_gate2);
+        _gemm_input_gate2.run();
+        CLScheduler::get().enqueue(_accum_input_gate1);
+        _accum_input_gate2.run();
+        CLScheduler::get().enqueue(_activation_input_gate);
+    }
+
+    _fully_connected_cell_state.run();
+    CLScheduler::get().enqueue(_transpose_cell_state1);
+    _gemm_cell_state1.run();
+    CLScheduler::get().enqueue(_accum_cell_state1);
+    CLScheduler::get().enqueue(_activation_cell_state);
+    CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
+    CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
+    CLScheduler::get().enqueue(_accum_cell_state2);
+
+    if(_perform_cell_clipping)
+    {
+        CLScheduler::get().enqueue(_cell_clip);
+    }
+
+    _fully_connected_output.run();
+    CLScheduler::get().enqueue(_transpose_output1);
+    _gemm_output1.run();
+    CLScheduler::get().enqueue(_accum_output1);
+    CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+    if(_run_peephole_opt)
+    {
+        CLScheduler::get().enqueue(_transpose_output2);
+        _gemm_output2.run();
+        _accum_output2.run();
+    }
+    CLScheduler::get().enqueue(_activation_output);
+
+    CLScheduler::get().enqueue(_activation_output_state);
+    CLScheduler::get().enqueue(_pixelwise_mul_output_state);
+
+    if(_has_projection_weights)
+    {
+        _fully_connected_output_state.run();
+        if(_perform_projection_clipping)
+        {
+            CLScheduler::get().enqueue(_projection_clip);
+        }
+    }
+
+    CLScheduler::get().enqueue(_copy_cell_state);
+    CLScheduler::get().enqueue(_copy_output);
+
+    _concat_scratch_buffer.run();
+
+    _memory_group.release();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 9120aad..986fe00 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -33,72 +33,120 @@
 
 using namespace arm_compute;
 
-CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+namespace
 {
-}
-
-void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                      TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+    ARM_COMPUTE_UNUSED(output);
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
-    }
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
 
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
-
-    // Get parameters for conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
+    bool has_bias = (biases != nullptr);
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
                                                  conv_info);
 
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+    const size_t mat_weights_cols = weights->dimension(3);
+    const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->dimension(4);
 
-    // Create tensor to store the reshaped weights
-    const size_t mat_weights_cols = weights->info()->dimension(3);
-    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
-    const size_t mat_weights_num  = weights->info()->dimension(4);
+    shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
 
-    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
-
-    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
-    // Create tensor to store im2col reshaped inputs
     const size_t mat_input_cols = mat_weights_rows;
     const size_t mat_input_rows = conv_w * conv_h;
-    TensorShape  shape_im2col   = input->info()->tensor_shape();
+
+    shape_im2col = input->tensor_shape();
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
 
-    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
-    // Create locally connected layer output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+    shape_gemm = shape_im2col;
     shape_gemm.set(0, mat_weights_cols);
     shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
+CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
+      _is_first_run(false), _original_weights(nullptr)
+{
+}
+
+Status CLLocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+    bool has_bias = (biases != nullptr);
+
+    if(has_bias)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+    }
+
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+    TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+    TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+    TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+
+    return Status{};
+}
+
+void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLLocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
+
+    bool _has_bias    = (biases != nullptr);
+    _original_weights = weights;
+    _is_first_run     = true;
+
+    const unsigned int kernel_width  = weights->info()->dimension(0);
+    const unsigned int kernel_height = weights->info()->dimension(1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
+
+    _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
+    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
     // Manage intermediate buffers
@@ -106,7 +154,7 @@
     _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(conv_w, conv_h), conv_info, _has_bias);
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
     _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
@@ -122,8 +170,13 @@
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         CLScheduler::get().enqueue(_weights_reshape_kernel);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index 146856c..55b7649 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -39,6 +39,6 @@
 
 Status CLPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(CLPermuteKernel::validate(input, output, perm));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(input, output, perm));
     return Status{};
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 201bf87..17875a3 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -41,13 +41,28 @@
     _kernel = std::move(k);
 
     // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    PixelValue zero_value(0.f);
+    BorderMode border_mode{};
+    PixelValue pixel_value(0.f);
     if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
     {
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        pixel_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
     }
-    _border_handler.configure(input, _kernel->border_size(), border_mode, zero_value);
+    switch(input->info()->data_layout())
+    {
+        case DataLayout::NCHW:
+            border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            break;
+        case DataLayout::NHWC:
+            border_mode = BorderMode::CONSTANT;
+            if(PoolingType::MAX == pool_info.pool_type() && !is_data_type_quantized_asymmetric(input->info()->data_type()))
+            {
+                pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
+    _border_handler.configure(input, _kernel->border_size(), border_mode, pixel_value);
 }
 
 Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index ed1f51c..a13859c 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
@@ -33,8 +34,21 @@
 {
 }
 
+Status CLQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    TensorInfo min_max{ input->num_channels(), input->data_type() };
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMinMaxLayerKernel::validate(input, &min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayerKernel::validate(input, output, &min_max));
+
+    return Status{};
+}
+
 void CLQuantizationLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
     // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel.
     _min_max_kernel.configure(input, &_min_max);
 
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
new file mode 100644
index 0000000..4843ba6
--- /dev/null
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+{
+}
+
+Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
+                            const ITensorInfo *output, const ActivationLayerInfo &info)
+{
+    const int idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
+
+    auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info, true, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+
+    return Status{};
+}
+
+void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+                           ActivationLayerInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+
+    const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    TensorShape shape      = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+
+    _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+
+    // Manage intermediate buffers and configure
+    _memory_group.manage(&_fully_connected_out);
+    _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out, true, false);
+
+    _memory_group.manage(&_gemm_output);
+    _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
+
+    _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
+    _memory_group.manage(&_add_output);
+
+    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+
+    _fully_connected_out.allocator()->allocate();
+    _gemm_output.allocator()->allocate();
+
+    _activation_kernel.configure(&_add_output, hidden_state, info);
+    _add_output.allocator()->allocate();
+
+    _copy_kernel.configure(hidden_state, output);
+}
+
+void CLRNNLayer::run()
+{
+    _memory_group.acquire();
+    _fully_connected_kernel.run();
+    _gemm_state_f.run();
+    CLScheduler::get().enqueue(_add_kernel);
+    CLScheduler::get().enqueue(_activation_kernel);
+
+    // copy hidden out to output
+    CLScheduler::get().enqueue(_copy_kernel);
+    _memory_group.release();
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index d02afb4..3a5133d 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,19 +35,64 @@
 
 using namespace arm_compute;
 
+namespace
+{
+unsigned int calculate_number_of_stages(const ITensorInfo *input)
+{
+    // Calculate number of WGs. 16 elements per thread, 8 threads per WG
+    const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
+
+    // Calculate number of stages. First stage performs op and the rest reduction sum
+    // depending on the size of the input. Last stage should have only 1 WG.
+    const unsigned int num_of_stages = num_of_wg / 128 + 2;
+
+    return num_of_stages;
+}
+} // namespace
+
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
 {
 }
 
+Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    const unsigned int num_of_stages = calculate_number_of_stages(input);
+
+    // Create temporary tensor infos
+    auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+    // Create intermediate tensor info
+    TensorShape shape{ input->tensor_shape() };
+
+    for(unsigned int i = 0; i < num_of_stages - 1; i++)
+    {
+        shape.set(0, ceil(shape.x() / 128.f));
+        sums_vector[i].set_data_type(input->data_type());
+        sums_vector[i].set_tensor_shape(shape);
+        sums_vector[i].set_num_channels(input->num_channels());
+        sums_vector[i].set_fixed_point_position(input->fixed_point_position());
+    }
+
+    // Validate ReductionOperation only on first kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
+
+    // Validate ReductionOperation on intermediate stages
+    for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+    }
+
+    // Validate ReductionOperation on the last stage
+    const unsigned int last_stage = num_of_stages - 1;
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
+
+    return Status{};
+}
+
 void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
 {
-    // Calculate number of WGs. 16 elements per thread, 8 threads per WG
-    unsigned int num_of_wg = ceil(input->info()->dimension(0) / 128.f);
-
-    // Calculate number of stages. First stage performs op and the rest reduction sum
-    // depending on the size of the input. Last stage should have only 1 WG.
-    _num_of_stages = num_of_wg / 128 + 2;
+    _num_of_stages = calculate_number_of_stages(input->info());
 
     // Create temporary tensors
     _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
@@ -95,4 +140,4 @@
     }
 
     _memory_group.release();
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
new file mode 100644
index 0000000..d542781
--- /dev/null
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
+    : _concat_kernels_vector(),
+      _num_inputs(0)
+{
+}
+
+Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+
+    // Output auto initialization if not yet initialized
+    TensorInfo  tmp_output_info = *output->clone();
+    TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type(), inputs_vector[0]->fixed_point_position());
+
+    unsigned int width_offset = 0;
+    for(const auto &input : inputs_vector)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+        width_offset += input->dimension(0);
+    }
+
+    return Status{};
+}
+
+void CLWidthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+{
+    _num_inputs = inputs_vector.size();
+
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+    ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
+
+    unsigned int width_offset = 0;
+
+    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+    for(unsigned int i = 0; i < _num_inputs; i++)
+    {
+        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+        width_offset += inputs_vector.at(i)->info()->dimension(0);
+    }
+}
+
+void CLWidthConcatenateLayer::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    for(unsigned i = 0; i < _num_inputs; i++)
+    {
+        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+    }
+}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..49753ad
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+    Size2D output_tile = Size2D{};
+
+    if(kernel_dims == Size2D(3U, 3U))
+    {
+        output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+    }
+    else if(kernel_dims == Size2D(5U, 5U))
+    {
+        output_tile = Size2D(4U, 4U);
+    }
+
+    return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+    // Check if we want to configure a Winograd configuration which requires fast math
+    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    std::vector<WinogradConfiguration> fast_math_winograd =
+    {
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+    };
+
+    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+                            std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} // namespace
+
+CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _activationlayer_function(), _input0(), _input1(), _batched_mm_output(),
+      _original_weights(nullptr), _is_prepared(false), _is_activationlayer_enabled(false)
+{
+}
+
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+                                           bool enable_fast_math)
+{
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
+                                                    kernel_size,
+                                                    input_dims,
+                                                    conv_info,
+                                                    input->info()->data_layout());
+
+    _is_prepared      = false;
+    _original_weights = weights;
+
+    // Manage intermediate tensors
+    _memory_group.manage(&_input0);
+    _memory_group.manage(&_batched_mm_output);
+
+    // Do not manage _input1 as it contains the weights
+
+    // Configure input transform
+    _input_transform.configure(input, &_input0, winograd_info);
+
+    // Configure filter transform
+    _filter_transform.configure(weights, &_input1, winograd_info);
+
+    // Configure batched matrix multiply
+    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+    // Configure output transform
+    _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
+
+    // Configure activation layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
+
+    // Allocate temporary tensors
+    _input0.allocator()->allocate();
+    _batched_mm_output.allocator()->allocate();
+}
+
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->tensor_shape()[idx_width], input->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
+                                                    kernel_size,
+                                                    input_dims,
+                                                    conv_info,
+                                                    input->data_layout());
+
+    // Validate input transform
+    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, winograd_info));
+
+    // Validate filter transform
+    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+    // Validate batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+
+    // Validate output transform
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
+
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+
+    return Status{};
+}
+
+void CLWinogradConvolutionLayer::run()
+{
+    prepare();
+
+    _memory_group.acquire();
+
+    // Run input transform
+    _input_transform.run();
+
+    // Run batched matrix multiplication
+    _batched_mm.run();
+
+    // Run output transform
+    CLScheduler::get().enqueue(_output_transform);
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+
+    _memory_group.release();
+}
+
+void CLWinogradConvolutionLayer::prepare()
+{
+    if(!_is_prepared)
+    {
+        // Run filter transform and mark original weights as unused
+        _input1.allocator()->allocate();
+        CLScheduler::get().enqueue(_filter_transform, false);
+        _original_weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by CLGEMM
+        _batched_mm.prepare();
+        if(!_input1.is_used())
+        {
+            _input1.allocator()->free();
+        }
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
new file mode 100644
index 0000000..09e8456
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLWinogradInputTransformKernel>();
+    k->configure(input, output, winograd_info);
+    _kernel = std::move(k);
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
+
+Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, winograd_info));
+    return Status{};
+}
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
new file mode 100644
index 0000000..c0ebd24
--- /dev/null
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernels.h"
+#include "arm_compute/core/utils/misc/Cast.h"
+
+namespace arm_compute
+{
+namespace tuners
+{
+namespace
+{
+/** Tunes a @ref CLDirectConvolutionLayerKernel for a Bifrost target
+ *
+ * @param[in,out] k Kernel to tune
+ */
+void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k)
+{
+    cl::NDRange lws_hint = k.lws_hint();
+
+    const GPUTarget    gpu_target    = k.get_target();
+    const DataType     dt            = k._input->info()->data_type();
+    const TensorShape  weights_shape = k._weights->info()->tensor_shape();
+    const TensorShape  inputs_shape  = k._input->info()->tensor_shape();
+    const size_t       kernel_size   = weights_shape.x();
+    const unsigned int stride_x      = k._conv_stride_x;
+    const unsigned int stride_y      = k._conv_stride_y;
+
+    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32))
+    {
+        // Through extensive experimentation with over 30 representative tensor
+        // shapes, we found a small number of local work size configurations
+        // that result in nearly optimal execution times. Selecting the right
+        // lws for a given shape, however, required a complex decision tree,
+        // until we constructed a simple feature as described below.
+        //
+        // We started from the number of multiply-accumulate operations for a
+        // convolution layer, which is equal to the product of the input
+        // dimensions 0..2 and the weights dimensions 0..2.  Unfortunately,
+        // this resulted in ties between distinct shapes that required distinct
+        // lws configurations. Replacing the width of the input with the kernel
+        // size, however, resulted in nearly optimal predictions. We use underscores
+        // in variable names to indicate when they are intentionally misleading.
+        const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2];
+        const size_t product_of_input_dimensions_  = inputs_shape[0] * inputs_shape[1] * inputs_shape[2];
+        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
+
+        switch(kernel_size)
+        {
+            case 1:
+            {
+                if(mega_ops_ < 1.f)
+                {
+                    lws_hint = cl::NDRange(1, 1, 8);
+                }
+                else if(mega_ops_ < 7.f)
+                {
+                    lws_hint = cl::NDRange(1, 1, 4);
+                }
+                else
+                {
+                    lws_hint = cl::NDRange(1, 1, 2);
+                }
+                break;
+            }
+            case 3:
+            {
+                if(mega_ops_ < 1.f)
+                {
+                    lws_hint = cl::NDRange(1, 1, 8);
+                }
+                else if(mega_ops_ < 13.f)
+                {
+                    lws_hint = cl::NDRange(2, 1, 4);
+                }
+                else if(mega_ops_ < 50.f)
+                {
+                    lws_hint = cl::NDRange(3, 1, 4);
+                }
+                else
+                {
+                    lws_hint = cl::NDRange(2, 1, 6);
+                }
+                break;
+            }
+            case 5:
+            {
+                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
+                {
+                    lws_hint = cl::NDRange(2, 1, 4);
+                }
+                else
+                {
+                    lws_hint = cl::NDRange(2, 1, 8);
+                }
+                break;
+            }
+            default:
+                break;
+        }
+        k.set_lws_hint(lws_hint);
+    }
+}
+} // namespace
+
+void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
+{
+    // Statically tune the kernel if it is a direct convolution layer kernel
+    if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
+    {
+        tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
+    }
+}
+
+void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+}
+} // namespace tuners
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 168ed6e..92dce34 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
 
 #include <condition_variable>
 #include <iostream>
@@ -159,6 +160,7 @@
     : _num_threads(num_threads_hint()),
       _threads(_num_threads - 1)
 {
+    get_cpu_configuration(_cpu_info);
 }
 
 void CPPScheduler::set_num_threads(unsigned int num_threads)
@@ -178,7 +180,7 @@
 
     /** [Scheduler example] */
     ThreadInfo info;
-    info.cpu_info = _info;
+    info.cpu_info = &_cpu_info;
 
     const Window      &max_window     = kernel->window();
     const unsigned int num_iterations = max_window.num_iterations(split_dimension);
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index c8285b4..2adc14c 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,7 @@
 {
     ARM_COMPUTE_UNUSED(split_dimension);
     ThreadInfo info;
-    info.cpu_info = cpu_info();
+    info.cpu_info = &_cpu_info;
     kernel->run(kernel->window(), info);
 }
 
diff --git a/src/graph/CL/CLUnmap.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
similarity index 65%
copy from src/graph/CL/CLUnmap.cpp
copy to src/runtime/CPP/functions/CPPUpsample.cpp
index 31f2f19..619b7e1 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,23 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/CL/CLUnmap.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/ITensorObject.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
+#include "support/ToolchainSupport.h"
 
-using namespace arm_compute::graph;
+using namespace arm_compute;
 
-CLUnmap::CLUnmap(ITensorObject *tensor)
-    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
+void CPPUpsample::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info, unsigned int inner_border_right, unsigned int inner_border_top)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
-}
-
-void CLUnmap::run()
-{
-    _tensor->unmap(arm_compute::CLScheduler::get().queue());
-}
+    auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernel>();
+    k->configure(input, output, info, inner_border_right, inner_border_top);
+    _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
new file mode 100644
index 0000000..7e8bf2b
--- /dev/null
+++ b/src/runtime/CPUUtils.cpp
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPUUtils.h"
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#include <array>
+#include <cstdlib>
+#include <cstring>
+#include <fcntl.h>
+#include <fstream>
+#include <map>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef BARE_METAL
+#include <regex>
+#include <thread>
+#endif /* BARE_METAL */
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !BARE_METAL */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif /* HWCAP_ASIMDHP */
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif /* HWCAP_CPUID */
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP (1 << 20)
+#endif /* HWCAP_ASIMDDP */
+
+namespace
+{
+using namespace arm_compute;
+
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+struct PerCPUData
+{
+    CPUModel     model     = CPUModel::GENERIC;
+    unsigned int midr      = 0;
+    bool         model_set = false;
+};
+
+/* Convert an MIDR register value to a CPUModel enum value. */
+CPUModel midr_to_model(const unsigned int midr)
+{
+    CPUModel model;
+
+    // Unpack variant and CPU ID
+    const int variant = (midr >> 20) & 0xF;
+    const int cpunum  = (midr >> 4) & 0xFFF;
+
+    // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
+    switch(cpunum)
+    {
+        case 0xd03:
+            model = CPUModel::A53;
+            break;
+
+        case 0xd05:
+            if(variant != 0)
+            {
+                model = CPUModel::A55r1;
+            }
+            else
+            {
+                model = CPUModel::A55r0;
+            }
+            break;
+
+        default:
+            model = CPUModel::GENERIC;
+            break;
+    }
+
+    return model;
+}
+
+void populate_models_cpuid(std::vector<PerCPUData> &cpusv)
+{
+    // If the CPUID capability is present, MIDR information is provided in /sys. Use that to populate the CPU model table.
+    uint32_t i = 0;
+    for(auto &c : cpusv)
+    {
+        std::stringstream str;
+        str << "/sys/devices/system/cpu/cpu" << i++ << "/regs/identification/midr_el1";
+        std::ifstream file;
+        file.open(str.str(), std::ios::in);
+        if(file.is_open())
+        {
+            std::string line;
+            if(bool(getline(file, line)))
+            {
+                const unsigned long midr = support::cpp11::stoul(line, nullptr, 16);
+                c.midr                   = (midr & 0xffffffff);
+                c.model                  = midr_to_model(c.midr);
+                c.model_set              = true;
+            }
+        }
+    }
+}
+
+void populate_models_cpuinfo(std::vector<PerCPUData> &cpusv)
+{
+    // If "long-form" cpuinfo is present, parse that to populate models.
+    std::regex proc_regex("^processor.*(\\d+)$");
+    std::regex imp_regex("^CPU implementer.*0x(..)$");
+    std::regex var_regex("^CPU variant.*0x(.)$");
+    std::regex part_regex("^CPU part.*0x(...)$");
+    std::regex rev_regex("^CPU revision.*(\\d+)$");
+
+    std::ifstream file;
+    file.open("/proc/cpuinfo", std::ios::in);
+
+    if(file.is_open())
+    {
+        std::string line;
+        int         midr   = 0;
+        int         curcpu = -1;
+
+        while(bool(getline(file, line)))
+        {
+            std::smatch match;
+
+            if(std::regex_match(line, match, proc_regex))
+            {
+                std::string id     = match[1];
+                int         newcpu = support::cpp11::stoi(id, nullptr, 0);
+
+                if(curcpu >= 0 && midr == 0)
+                {
+                    // Matched a new CPU ID without any description of the previous one - looks like old format.
+                    return;
+                }
+
+                if(curcpu >= 0)
+                {
+                    cpusv[curcpu].midr      = midr;
+                    cpusv[curcpu].model     = midr_to_model(midr);
+                    cpusv[curcpu].model_set = true;
+                }
+
+                midr   = 0;
+                curcpu = newcpu;
+
+                continue;
+            }
+
+            if(std::regex_match(line, match, imp_regex))
+            {
+                int impv = support::cpp11::stoi(match[1], nullptr, 16);
+                midr |= (impv << 24);
+                continue;
+            }
+
+            if(std::regex_match(line, match, var_regex))
+            {
+                int varv = support::cpp11::stoi(match[1], nullptr, 16);
+                midr |= (varv << 16);
+                continue;
+            }
+
+            if(std::regex_match(line, match, part_regex))
+            {
+                int partv = support::cpp11::stoi(match[1], nullptr, 16);
+                midr |= (partv << 4);
+                continue;
+            }
+
+            if(std::regex_match(line, match, rev_regex))
+            {
+                int regv = support::cpp11::stoi(match[1], nullptr, 10);
+                midr |= (regv);
+                midr |= (0xf << 16);
+                continue;
+            }
+        }
+
+        if(curcpu >= 0)
+        {
+            cpusv[curcpu].midr      = midr;
+            cpusv[curcpu].model     = midr_to_model(midr);
+            cpusv[curcpu].model_set = true;
+        }
+    }
+}
+
+int get_max_cpus()
+{
+    int max_cpus = 1;
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+    std::ifstream CPUspresent;
+    CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
+    bool success = false;
+
+    if(CPUspresent.is_open())
+    {
+        std::string line;
+
+        if(bool(getline(CPUspresent, line)))
+        {
+            /* The content of this file is a list of ranges or single values, e.g.
+             * 0-5, or 1-3,5,7 or similar.  As we are interested in the
+             * max valid ID, we just need to find the last valid
+             * delimiter ('-' or ',') and parse the integer immediately after that.
+             */
+            auto startfrom = line.begin();
+
+            for(auto i = line.begin(); i < line.end(); ++i)
+            {
+                if(*i == '-' || *i == ',')
+                {
+                    startfrom = i + 1;
+                }
+            }
+
+            line.erase(line.begin(), startfrom);
+
+            max_cpus = support::cpp11::stoi(line, nullptr, 0) + 1;
+            success  = true;
+        }
+    }
+
+    // Return std::thread::hardware_concurrency() as a fallback.
+    if(!success)
+    {
+        max_cpus = std::thread::hardware_concurrency();
+    }
+#endif /* BARE_METAL */
+
+    return max_cpus;
+}
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+
+} // namespace
+
+namespace arm_compute
+{
+void get_cpu_configuration(CPUInfo &cpuinfo)
+{
+#if !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__))
+    bool cpuid        = false;
+    bool fp16_support = false;
+    bool dot_support  = false;
+
+    const uint32_t hwcaps = getauxval(AT_HWCAP);
+
+    if((hwcaps & HWCAP_CPUID) != 0)
+    {
+        cpuid = true;
+    }
+
+    if((hwcaps & HWCAP_ASIMDHP) != 0)
+    {
+        fp16_support = true;
+    }
+
+    if((hwcaps & HWCAP_ASIMDDP) != 0)
+    {
+        dot_support = true;
+    }
+
+#ifdef __aarch64__
+    /* Pre-4.15 kernels don't have the ASIMDDP bit.
+     *
+     * Although the CPUID bit allows us to read the feature register
+     * directly, the kernel quite sensibly masks this to only show
+     * features known by it to be safe to show to userspace.  As a
+     * result, pre-4.15 kernels won't show the relevant bit in the
+     * feature registers either.
+     *
+     * So for now, use a whitelist of CPUs known to support the feature.
+     */
+    if(!dot_support && cpuid)
+    {
+        /* List of CPUs with dot product support:         A55r1       A75r1       A75r2  */
+        const unsigned int dotprod_whitelist_masks[]  = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
+        const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
+
+        unsigned long cpuid;
+
+        __asm __volatile(
+            "mrs %0, midr_el1\n"
+            : "=r"(cpuid)
+            :
+            : );
+
+        for(int i = 0; dotprod_whitelist_values[i] != 0; i++)
+        {
+            if((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i])
+            {
+                dot_support = true;
+                break;
+            }
+        }
+    }
+#endif /* __aarch64__ */
+    const unsigned int max_cpus = get_max_cpus();
+    cpuinfo.set_cpu_num(max_cpus);
+    cpuinfo.set_fp16(fp16_support);
+    cpuinfo.set_dotprod(dot_support);
+    std::vector<PerCPUData> percpu(max_cpus);
+    if(cpuid)
+    {
+        populate_models_cpuid(percpu);
+    }
+    else
+    {
+        populate_models_cpuinfo(percpu);
+    }
+    int j(0);
+    for(const auto &v : percpu)
+    {
+        cpuinfo.set_cpu_model(j++, v.model);
+    }
+#else  /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+    ARM_COMPUTE_UNUSED(cpuinfo);
+#endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
+}
+
+unsigned int get_threads_hint()
+{
+    unsigned int num_threads_hint = 1;
+
+#ifndef BARE_METAL
+    std::map<std::string, unsigned int> cpu_part_occurrence_map;
+
+    // CPU part regex
+    std::regex  cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
+    std::smatch cpu_part_match;
+
+    // Read cpuinfo and get occurrence of each core
+    std::ifstream cpuinfo;
+    cpuinfo.open("/proc/cpuinfo", std::ios::in);
+    if(cpuinfo.is_open())
+    {
+        std::string line;
+        while(bool(getline(cpuinfo, line)))
+        {
+            if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
+            {
+                std::string cpu_part = cpu_part_match[1];
+                if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
+                {
+                    cpu_part_occurrence_map[cpu_part]++;
+                }
+                else
+                {
+                    cpu_part_occurrence_map[cpu_part] = 1;
+                }
+            }
+        }
+    }
+
+    // Get min number of threads
+    auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
+                                             [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
+    {
+        return p1.second < p2.second;
+    });
+
+    // Set thread hint
+    num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
+#endif /* BARE_METAL */
+
+    return num_threads_hint;
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
new file mode 100644
index 0000000..cdd12c3
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include <cstddef>
+
+namespace arm_compute
+{
+void *GCBufferAllocator::allocate(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(alignment);
+    auto *gl_buffer = new GLBufferWrapper();
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, gl_buffer->_ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+
+    return reinterpret_cast<void *>(gl_buffer);
+}
+
+void GCBufferAllocator::free(void *ptr)
+{
+    ARM_COMPUTE_ERROR_ON(ptr == nullptr);
+    auto *gl_buffer = reinterpret_cast<GLBufferWrapper *>(ptr);
+    delete gl_buffer;
+}
+
+std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment)
+{
+    ARM_COMPUTE_UNUSED(size, alignment);
+    return nullptr;
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index fcc8559..f781273 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 
 #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
 #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
 
 using namespace arm_compute;
@@ -31,7 +32,7 @@
 std::once_flag GCScheduler::_initialize_symbols;
 
 GCScheduler::GCScheduler()
-    : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT)
+    : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _target(GPUTarget::MIDGARD)
 {
 }
 
@@ -48,11 +49,13 @@
 {
     setup_context();
 
-    GCKernelLibrary::get().init("./cs_shaders/", _display, _context);
+    init(_display, _context);
 }
 
 void GCScheduler::init(EGLDisplay dpy, EGLContext ctx)
 {
+    _target = get_target_from_device();
+
     GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx);
 }
 
diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
index edbd16d..e193d26 100644
--- a/src/runtime/GLES_COMPUTE/GCTensor.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,7 @@
 using namespace arm_compute;
 
 GCTensor::GCTensor()
-    : _allocator()
+    : _allocator(this)
 {
 }
 
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
index 694b34f..abd2b48 100644
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,11 +31,16 @@
 
 using namespace arm_compute;
 
-GCTensorAllocator::GCTensorAllocator()
-    : _gl_buffer(), _mapping(nullptr)
+GCTensorAllocator::GCTensorAllocator(GCTensor *owner)
+    : _associated_memory_group(nullptr), _gl_buffer(), _mapping(nullptr), _owner(owner)
 {
 }
 
+GCTensorAllocator::~GCTensorAllocator()
+{
+    _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
+}
+
 uint8_t *GCTensorAllocator::data()
 {
     return _mapping;
@@ -43,17 +48,35 @@
 
 void GCTensorAllocator::allocate()
 {
-    _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
-    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
-    ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
-    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+    if(_associated_memory_group == nullptr)
+    {
+        _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
+        ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
+        ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
+        ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+    }
+    else
+    {
+        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_gl_buffer), info().total_size());
+    }
     info().set_is_resizable(false);
 }
 
 void GCTensorAllocator::free()
 {
-    _gl_buffer.reset();
-    info().set_is_resizable(true);
+    if(_associated_memory_group == nullptr)
+    {
+        _gl_buffer.reset();
+        info().set_is_resizable(true);
+    }
+}
+
+void GCTensorAllocator::set_associated_memory_group(GCMemoryGroup *associated_memory_group)
+{
+    ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
+    ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
+    ARM_COMPUTE_ERROR_ON(_gl_buffer.get() != nullptr);
+    _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *GCTensorAllocator::lock()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index 1d2370e..2a710f7 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -37,14 +37,14 @@
 using namespace arm_compute;
 
 GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
-    : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+    : _weights_reshape_kernel(), _weights_reshaped()
 {
 }
 
-void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose1xW)
+void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
@@ -56,73 +56,66 @@
     }
 
     const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
-    const unsigned   bias_element  = (append_biases) ? 1 : 0;
     const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
 
-    _transpose1xW = transpose1xW;
-
-    if(transpose1xW)
-    {
-        // Create tensor to store the reshaped weights
-        const unsigned int mat_weights_cols = weights->info()->dimension(3);
-        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
-        TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
-        const DataType     dt                   = weights->info()->data_type();
-        const int          fixed_point_position = weights->info()->fixed_point_position();
-        TensorInfo         info_wr(shape_wr, 1, dt, fixed_point_position);
-
-        _weights_reshaped.allocator()->init(info_wr);
-        _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
-        _weights_transposed_kernel.configure(&_weights_reshaped, output);
-        _weights_reshaped.allocator()->allocate();
-    }
-    else
-    {
-        _weights_reshape_kernel.configure(weights, biases_to_use, output);
-    }
+    _weights_reshape_kernel.configure(weights, biases_to_use, output);
 }
 
 void GCConvolutionLayerReshapeWeights::run()
 {
     GCScheduler::get().dispatch(_weights_reshape_kernel);
-    if(_transpose1xW)
-    {
-        GCScheduler::get().dispatch(_weights_transposed_kernel);
-    }
 }
 
-GCConvolutionLayer::GCConvolutionLayer()
-    : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _fill_border(), _input_im2col_reshaped(), _input_interleaved_reshaped(),
-      _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
+      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false)
 {
 }
 
-void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output, bool is_interleaved_transposed)
+void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
 {
-    _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
+
+    _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
 }
 
-void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
 {
+    // Perform validation step on Matrix multiply function
+    GCGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
+    return Status{};
+}
+
+void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
+    _is_first_run     = true;
+    _original_weights = weights;
+
     if(biases != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
 
     const DataType dt = input->info()->data_type();
 
-    _append_bias          = (biases != nullptr);
-    _are_weights_reshaped = weights_info.are_reshaped();
+    // Set the GPU target for im2col and col2im
+    _input_im2col_kernel.set_target(GCScheduler::get().get_target());
+    _output_col2im_kernel.set_target(GCScheduler::get().get_target());
 
-    const unsigned   bias_element  = (_append_bias) ? 1 : 0;
-    const IGCTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+    const bool       append_bias   = (biases != nullptr);
+    const unsigned   bias_element  = (append_bias) ? 1 : 0;
+    const IGCTensor *biases_to_use = (append_bias) ? biases : nullptr;
 
     // Get parameters from conv_info
     unsigned int stride_x = 0;
@@ -133,57 +126,19 @@
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
 
-    const unsigned int kernel_width  = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
-    const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+    const unsigned int kernel_width  = weights->info()->dimension(0);
+    const unsigned int kernel_height = weights->info()->dimension(1);
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
-
-    // Check if its a "fully connected" convolution
-    _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
-    const bool run_interleaved      = (!_is_fully_connected_convolution);
+                                                 conv_info, dilation);
 
     unsigned int mat_weights_cols = weights->info()->dimension(3);
     unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
 
-    // Reshape weights if needed
-    if(_are_weights_reshaped)
-    {
-        if(_is_fully_connected_convolution)
-        {
-            mat_weights_cols = weights->info()->dimension(0);
-            mat_weights_rows = weights->info()->dimension(1);
-        }
-        else
-        {
-            mat_weights_cols                         = weights_info.num_kernels();
-            const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
-            mat_weights_rows                         = quarter_reshaped_cols + bias_element;
-        }
-    }
-    else
-    {
-        if(_is_fully_connected_convolution)
-        {
-            // Create tensor to store the reshaped weights
-            int num_elems_read_per_iteration_x = 1;
-            if(dt == DataType::F16)
-            {
-                num_elems_read_per_iteration_x = 2;
-            }
-            TensorShape shape_wr((ceil_to_multiple(mat_weights_cols, num_elems_read_per_iteration_x)), mat_weights_rows);
-            _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
-            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
-        }
-        else
-        {
-            // Create tensor to store transposed weights
-            const float transpose_width = 16.0f / input->info()->element_size();
-            TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
-            _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
-            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
-        }
-        weights = &_weights_reshaped;
-    }
+    // _weights_reshaped will be auto configured in the kernel.
+    // Just append biases and do not transpose 1xW as it will be reshaped in GCGEMM
+    _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
+
+    weights = &_weights_reshaped;
 
     // Create tensor to store im2col reshaped inputs
     const unsigned int mat_input_cols = mat_weights_rows;
@@ -195,17 +150,7 @@
 
     TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
     _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
-
-    // Create tensor (interleave) to prepare input tensor for GEMM
-    if(run_interleaved)
-    {
-        TensorShape shape_interleaved = shape_im2col;
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-
-        TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
-        _input_interleaved_reshaped.allocator()->init(interleaved_info);
-    }
+    _memory_group.manage(&_input_im2col_reshaped);
 
     // Create GEMM output tensor
     TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
@@ -215,27 +160,20 @@
 
     TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
     _gemm_output.allocator()->init(info_gemm);
+    _memory_group.manage(&_gemm_output);
 
-    // Configure kernels
     if(dt == DataType::F16)
     {
         BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
         input->info()->extend_padding(border_size);
         _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for PAD of im2col fp16: consider it as border
     }
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
+    // Configure im2col
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
 
-    // Configure matrix multiply
-    if(run_interleaved)
-    {
-        _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
-        configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output);
-        _input_interleaved_reshaped.allocator()->allocate();
-    }
-    else
-    {
-        configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false);
-    }
+    // Configure GEMM
+    configure_mm(&_input_im2col_reshaped, weights, &_gemm_output);
+
     _input_im2col_reshaped.allocator()->allocate();
 
     // Configure Col2Im
@@ -245,38 +183,53 @@
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
     // Allocate intermediate tensor
-    if(!_are_weights_reshaped)
+    _weights_reshaped.allocator()->allocate();
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
     {
-        _weights_reshaped.allocator()->allocate();
+        _activationlayer_function.configure(output, nullptr, act_info);
     }
+
+    ARM_COMPUTE_UNUSED(weights_info);
 }
 
 void GCConvolutionLayer::run()
 {
     // Run weights reshaping (Runs once for every configure)
-    if(!_are_weights_reshaped)
+    if(_is_first_run)
     {
-        _are_weights_reshaped = true;
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _reshape_weights.run();
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
+    _memory_group.acquire();
+
     // Run im2col
     GCScheduler::get().dispatch(_fill_border);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_input_im2col_kernel);
 
-    if(!_is_fully_connected_convolution)
-    {
-        GCScheduler::get().memory_barrier();
-        // Run interleave4x4
-        GCScheduler::get().dispatch(_input_interleave_kernel);
-    }
-
-    GCScheduler::get().memory_barrier();
-    // Runs matrix multiply on reshaped matrices
-    GCScheduler::get().dispatch(_mm_kernel);
+    // Run gemm on reshaped matrices
+    _mm_gemm.run();
 
     GCScheduler::get().memory_barrier();
     // Reshape output matrix
     GCScheduler::get().dispatch(_output_col2im_kernel, false);
+
+    _memory_group.release();
+
+    GCScheduler::get().memory_barrier();
+    // Run Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index 9cba371..7121654 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -35,10 +35,10 @@
 {
 }
 
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
-    k->configure(input, weights, biases, output, conv_info);
+    k->configure(input, weights, biases, output, conv_info, depth_multiplier);
     _kernel = std::move(k);
 
     // Configure border handler
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index a2607d4..c0cf098 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -39,26 +39,27 @@
 {
 }
 
-void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     int kernel_size = weights->info()->dimension(0);
 
     if(kernel_size == 1)
     {
         auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer1x1Kernel>();
-        k->configure(input, weights, biases, output, conv_info);
+        k->configure(input, weights, biases, output, conv_info, act_info);
         _kernel = std::move(k);
     }
     else if(kernel_size == 3)
     {
         auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer3x3Kernel>();
-        k->configure(input, weights, biases, output, conv_info);
+        k->configure(input, weights, biases, output, conv_info, act_info);
         _kernel = std::move(k);
     }
     else if(kernel_size == 5)
     {
         auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer5x5Kernel>();
-        k->configure(input, weights, biases, output, conv_info);
+        k->configure(input, weights, biases, output, conv_info, act_info);
         _kernel = std::move(k);
     }
     else
@@ -79,4 +80,6 @@
     GCScheduler::get().dispatch(_border_handler, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(*_kernel);
+    GCScheduler::get().memory_barrier();
+    GCScheduler::get().dispatch(_shift_handler);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 9e4f0f6..a300033 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -38,9 +38,9 @@
     _kernel = std::move(k);
 }
 
-GCFullyConnectedLayer::GCFullyConnectedLayer()
-    : _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true),
-      _accumulate_biases(false)
+GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
+      _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
 {
 }
 
@@ -61,6 +61,7 @@
     _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
 
     // Configure im2col kernel
+    _memory_group.manage(&_im2col_output);
     _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
 
     // Configure matrix multiply kernel
@@ -78,7 +79,8 @@
     _mm_kernel.configure(input, weights, output, 1.0f, false);
 }
 
-void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose_weights, bool are_weights_reshaped)
+void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
+                                      bool transpose_weights, bool are_weights_reshaped, bool retain_internal_weights)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
@@ -140,11 +142,14 @@
     }
 
     // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
-    if(!_are_weights_reshaped)
+    if(!_are_weights_reshaped && !retain_internal_weights)
     {
         // Allocate the tensor for the weights reshaped
         _reshape_weights_output.allocator()->allocate();
     }
+
+    ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
+    _are_weights_reshaped = _are_weights_reshaped || retain_internal_weights;
 }
 
 void GCFullyConnectedLayer::run()
@@ -156,6 +161,8 @@
         _reshape_weights_kernel.run();
     }
 
+    _memory_group.acquire();
+
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
     {
@@ -177,4 +184,6 @@
 
         GCScheduler::get().dispatch(_accumulate_biases_kernel);
     }
+
+    _memory_group.release();
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 5122c20..79f8f71 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -38,59 +38,90 @@
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 using namespace arm_compute;
-using namespace arm_compute::gles_compute;
 
-GCGEMM::GCGEMM()
-    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+namespace
+{
+Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    if(c != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
+        ARM_COMPUTE_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(gemm_info);
+    return Status{};
+}
+} // namespace
+
+GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
+      _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
 
 void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.reshape_b_only_on_first_run(), "Reshape matrix B only on first run is not supported");
-    ARM_COMPUTE_UNUSED(gemm_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
-    if(c != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
-        ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
-        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
-        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
-    }
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
 
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
-    // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
-    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+    // Check if we need to reshape the matrix B only on the first run
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
 
     const IGCTensor *matrix_a = a;
     const IGCTensor *matrix_b = b;
 
+    // Get the GPU target
+    const GPUTarget gpu_target = GCScheduler::get().get_target();
+
+    // Set the target for the kernels
+    _interleave_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
+    // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to GCGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+    // in order to know how the matrices have been reshaped
+    const int m                         = a->info()->dimension(1);
+    const int n                         = b->info()->dimension(0);
+    const int k                         = a->info()->dimension(0);
+    int       mult_transpose1xW_width   = 1;
+    int       mult_interleave4x4_height = 1;
+
+    // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
+    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
     if(_is_interleaved_transposed)
     {
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
-        TensorShape shape_tmp_a = a->info()->tensor_shape();
-        TensorShape shape_tmp_b = b->info()->tensor_shape();
-
-        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
-        const unsigned int transpose_w = max_gc_vector_width / data_size_from_type(b->info()->data_type());
-        shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
-        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
-
-        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
-        _tmp_a.allocator()->init(info_a);
-
-        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
-        _tmp_b.allocator()->init(info_b);
+        // Manage intermediate buffers
+        _memory_group.manage(&_tmp_a);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_tmp_b);
+        }
+        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
 
         // Configure interleave kernel
         _interleave_kernel.configure(a, &_tmp_a);
@@ -99,7 +130,7 @@
         _transpose_kernel.configure(b, &_tmp_b);
     }
 
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
 
     if(_is_interleaved_transposed)
     {
@@ -116,15 +147,31 @@
     }
 }
 
+Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
+    return Status{};
+}
+
 void GCGEMM::run()
 {
+    _memory_group.acquire();
     if(_is_interleaved_transposed)
     {
         // Run interleave kernel
         GCScheduler::get().dispatch(_interleave_kernel, false);
 
-        // Run transpose kernel
-        GCScheduler::get().dispatch(_transpose_kernel, false);
+        if(_is_first_run)
+        {
+            // Run transpose kernel
+            GCScheduler::get().dispatch(_transpose_kernel, false);
+            _is_first_run = false;
+        }
+        else if(!_reshape_b_only_on_first_run)
+        {
+            // Run transpose kernel
+            GCScheduler::get().dispatch(_transpose_kernel, false);
+        }
         GCScheduler::get().memory_barrier();
     }
 
@@ -137,4 +184,5 @@
         GCScheduler::get().memory_barrier();
         GCScheduler::get().dispatch(_ma_kernel);
     }
+    _memory_group.release();
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index fc3882d..b2e69ee 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@
 
 using namespace arm_compute;
 
-GCNormalizationLayer::GCNormalizationLayer()
-    : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+GCNormalizationLayer::GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
 {
 }
 
@@ -43,6 +43,7 @@
     ARM_COMPUTE_ERROR_ON(input == nullptr);
 
     _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+    _memory_group.manage(&_squared_input);
 
     _norm_kernel.configure(input, &_squared_input, output, norm_info);
     _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
@@ -55,9 +56,13 @@
 
 void GCNormalizationLayer::run()
 {
+    _memory_group.acquire();
+
     GCScheduler::get().dispatch(_multiply_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_border_handler, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel, true);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 5221c5c..1748a59 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,8 @@
 
 using namespace arm_compute;
 
-GCSoftmaxLayer::GCSoftmaxLayer()
-    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
 {
 }
 
@@ -50,6 +50,11 @@
     _max.allocator()->init(tensor_info_max_sum);
     _sum.allocator()->init(tensor_info_max_sum);
 
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp);
+    _memory_group.manage(&_max);
+    _memory_group.manage(&_sum);
+
     // Configure Kernels
     _max_kernel.configure(input, &_max);
     _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
@@ -63,9 +68,13 @@
 
 void GCSoftmaxLayer::run()
 {
+    _memory_group.acquire();
+
     GCScheduler::get().dispatch(_max_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel);
+
+    _memory_group.release();
 }
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 583cb40..54a2bd2 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,202 +23,20 @@
  */
 #include "arm_compute/runtime/IScheduler.h"
 
-#include <array>
-#include <cstdlib>
-#include <cstring>
-#include <fcntl.h>
-#include <fstream>
-#include <map>
-#include <sched.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#ifndef BARE_METAL
-#include <regex>
-#include <thread>
-#endif /* BARE_METAL */
-
-namespace
-{
-unsigned int get_threads_hint()
-{
-    unsigned int num_threads_hint = 1;
-
-#ifndef BARE_METAL
-    std::map<std::string, unsigned int> cpu_part_occurrence_map;
-
-    // CPU part regex
-    std::regex  cpu_part_rgx(R"(.*CPU part.+?(?=:).+?(?=\w+)(\w+).*)");
-    std::smatch cpu_part_match;
-
-    // Read cpuinfo and get occurrence of each core
-    std::ifstream cpuinfo;
-    cpuinfo.open("/proc/cpuinfo", std::ios::in);
-    if(cpuinfo.is_open())
-    {
-        std::string line;
-        while(bool(getline(cpuinfo, line)))
-        {
-            if(std::regex_search(line.cbegin(), line.cend(), cpu_part_match, cpu_part_rgx))
-            {
-                std::string cpu_part = cpu_part_match[1];
-                if(cpu_part_occurrence_map.find(cpu_part) != cpu_part_occurrence_map.end())
-                {
-                    cpu_part_occurrence_map[cpu_part]++;
-                }
-                else
-                {
-                    cpu_part_occurrence_map[cpu_part] = 1;
-                }
-            }
-        }
-    }
-
-    // Get min number of threads
-    auto min_common_cores = std::min_element(cpu_part_occurrence_map.begin(), cpu_part_occurrence_map.end(),
-                                             [](const std::pair<std::string, unsigned int> &p1, const std::pair<std::string, unsigned int> &p2)
-    {
-        return p1.second < p2.second;
-    });
-
-    // Set thread hint
-    num_threads_hint = cpu_part_occurrence_map.empty() ? std::thread::hardware_concurrency() : min_common_cores->second;
-#endif /* BARE_METAL */
-
-    return num_threads_hint;
-}
-
-unsigned int get_cpu_impl()
-{
-#ifndef BARE_METAL
-    int fd = open("/proc/cpuinfo", 0); // NOLINT
-    std::array<char, 3000> buff{ {} };
-    char *pos     = nullptr;
-    char *end     = nullptr;
-    bool  foundid = false;
-
-    int cpu = sched_getcpu();
-
-    if(fd == -1)
-    {
-        return 0;
-    }
-
-    int charsread = read(fd, buff.data(), 3000);
-    pos           = buff.data();
-    end           = buff.data() + charsread;
-
-    close(fd);
-
-    /* So, to date I've encountered two formats for /proc/cpuinfo.
-     *
-     * One of them just lists processor : n  for each processor (with no
-     * other info), then at the end lists part information for the current
-     * CPU.
-     *
-     * The other has an entire clause (including part number info) for each
-     * CPU in the system, with "processor : n" headers.
-     *
-     * We can cope with either of these formats by waiting to see
-     * "processor: n" (where n = our CPU ID), and then looking for the next
-     * "CPU part" field.
-     */
-    while(pos < end)
-    {
-        if(foundid && strncmp(pos, "CPU part", 8) == 0)
-        {
-            /* Found part number */
-            pos += 11;
-
-            for(char *ch = pos; ch < end; ch++)
-            {
-                if(*ch == '\n')
-                {
-                    *ch = '\0';
-                    break;
-                }
-            }
-
-            return strtoul(pos, nullptr, 0);
-        }
-
-        if(strncmp(pos, "processor", 9) == 0)
-        {
-            /* Found processor ID, see if it's ours. */
-            pos += 11;
-
-            for(char *ch = pos; ch < end; ch++)
-            {
-                if(*ch == '\n')
-                {
-                    *ch = '\0';
-                    break;
-                }
-            }
-
-            int num = strtol(pos, nullptr, 0);
-
-            if(num == cpu)
-            {
-                foundid = true;
-            }
-        }
-
-        while(pos < end)
-        {
-            char ch = *pos++;
-            if(ch == '\n' || ch == '\0')
-            {
-                break;
-            }
-        }
-    }
-#endif /* BARE_METAL */
-
-    return 0;
-}
-} // namespace
+#include "arm_compute/runtime/CPUUtils.h"
 
 namespace arm_compute
 {
 IScheduler::IScheduler()
+    : _cpu_info()
 {
     // Work out the best possible number of execution threads
     _num_threads_hint = get_threads_hint();
-
-    // Work out the CPU implementation
-    switch(get_cpu_impl())
-    {
-        case 0xd0f:
-            _info.CPU = CPUTarget::A55_DOT;
-            break;
-        case 0xd03:
-            _info.CPU = CPUTarget::A53;
-            break;
-        default:
-#ifdef __arm__
-            _info.CPU = CPUTarget::ARMV7;
-#elif __aarch64__
-            _info.CPU = CPUTarget::ARMV8;
-#else  /* __arm__ || __aarch64__ */
-            _info.CPU = CPUTarget::INTRINSICS;
-#endif /* __arm__ || __aarch64__ */
-            break;
-    }
-
-    _info.L1_size = 31000;
-    _info.L2_size = 500000;
 }
 
-void IScheduler::set_target(CPUTarget target)
+CPUInfo &IScheduler::cpu_info()
 {
-    _info.CPU = target;
-}
-
-CPUInfo IScheduler::cpu_info() const
-{
-    return _info;
+    return _cpu_info;
 }
 
 unsigned int IScheduler::num_threads_hint() const
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index 2c64475..faaff8a 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,7 +37,7 @@
 using namespace arm_compute;
 
 ISimpleLifetimeManager::ISimpleLifetimeManager()
-    : _active_group(nullptr), _active_elements(), _finalized_groups()
+    : _active_group(nullptr), _active_elements(), _free_blobs(), _occupied_blobs(), _finalized_groups()
 {
 }
 
@@ -53,14 +53,21 @@
 void ISimpleLifetimeManager::start_lifetime(void *obj)
 {
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+    ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+
+    // Check if there is a free blob
+    if(_free_blobs.empty())
     {
-        return obj == e.id;
-    }) != std::end(_active_elements),
-    "Memory object is already registered!");
+        _occupied_blobs.emplace_front(Blob{ obj, 0, { obj } });
+    }
+    else
+    {
+        _occupied_blobs.splice(std::begin(_occupied_blobs), _free_blobs, std::begin(_free_blobs));
+        _occupied_blobs.front().id = obj;
+    }
 
     // Insert object in groups and mark its finalized state to false
-    _active_elements.emplace_back(obj);
+    _active_elements.insert(std::make_pair(obj, obj));
 }
 
 void ISimpleLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
@@ -68,36 +75,50 @@
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
 
     // Find object
-    auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
-    {
-        return obj == e.id;
-    });
-    ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
+    auto active_object_it = _active_elements.find(obj);
+    ARM_COMPUTE_ERROR_ON(active_object_it == std::end(_active_elements));
 
     // Update object fields and mark object as complete
-    it->handle = handle;
-    it->size   = size;
-    it->status = true;
+    Element &el = active_object_it->second;
+    el.handle   = handle;
+    el.size     = size;
+    el.status   = true;
+
+    // Find object in the occupied lists
+    auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
+    {
+        return obj == b.id;
+    });
+    ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs));
+
+    // Update occupied blob and return as free
+    occupied_blob_it->bound_elements.insert(obj);
+    occupied_blob_it->max_size = std::max(occupied_blob_it->max_size, size);
+    occupied_blob_it->id       = nullptr;
+    _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
 
     // Check if all object are finalized and reset active group
     if(are_all_finalized())
     {
-        // Update finalized groups
-        _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
+        ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty());
 
         // Update blobs and group mappings
         update_blobs_and_mappings();
 
+        // Update finalized groups
+        _finalized_groups[_active_group] = std::move(_active_elements);
+
         // Reset state
         _active_elements.clear();
         _active_group = nullptr;
+        _free_blobs.clear();
     }
 }
 
 bool ISimpleLifetimeManager::are_all_finalized() const
 {
-    return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
+    return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e)
     {
-        return !e.status;
+        return !e.second.status;
     });
 }
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index 35d0c82..15bbb17 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,40 +23,45 @@
  */
 #include "arm_compute/runtime/Memory.h"
 
-#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/MemoryRegion.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 Memory::Memory()
-    : _memory(nullptr), _memory_owned(nullptr)
+    : _region(nullptr), _region_owned(nullptr)
 {
+    create_empty_region();
 }
 
-Memory::Memory(std::shared_ptr<uint8_t> memory)
-    : _memory(nullptr), _memory_owned(std::move(memory))
+Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
+    : _region(nullptr), _region_owned(std::move(memory))
 {
-    ARM_COMPUTE_ERROR_ON(_memory_owned.get() == nullptr);
-    _memory = _memory_owned.get();
+    if(_region_owned == nullptr)
+    {
+        create_empty_region();
+    }
+    _region = _region_owned.get();
 }
 
-Memory::Memory(uint8_t *memory)
-    : _memory(memory), _memory_owned(nullptr)
+Memory::Memory(IMemoryRegion *memory)
+    : _region(memory), _region_owned(nullptr)
 {
-    ARM_COMPUTE_ERROR_ON(memory == nullptr);
+    _region = memory;
 }
 
-uint8_t *Memory::buffer()
+IMemoryRegion *Memory::region()
 {
-    return _memory;
+    return _region;
 }
 
-uint8_t *Memory::buffer() const
+IMemoryRegion *Memory::region() const
 {
-    return _memory;
+    return _region;
 }
 
-uint8_t **Memory::handle()
+void Memory::create_empty_region()
 {
-    ARM_COMPUTE_ERROR_ON(_memory_owned.get() != nullptr);
-    return &_memory;
-}
\ No newline at end of file
+    _region_owned = std::make_shared<MemoryRegion>(0);
+    _region       = _region_owned.get();
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000..b5b159a
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+
+using namespace arm_compute;
+
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
+    : _kernel()
+{
+}
+
+void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
+                                               DataLayout data_layout)
+{
+    _kernel.configure(input, output, original_input_shape, data_layout);
+}
+
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
+                                                DataLayout data_layout)
+{
+    return NEConvertFullyConnectedWeightsKernel::validate(input, output, original_input_shape, data_layout);
+}
+
+void NEConvertFullyConnectedWeights::run()
+{
+    NEScheduler::get().schedule(&_kernel, Window::DimZ);
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 0a49158..7053c7e 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -30,41 +30,44 @@
 
 #include <cmath>
 #include <tuple>
+#include <utility>
 
 namespace arm_compute
 {
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_manager(std::move(memory_manager)), _function()
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT
+    : _memory_manager(std::move(memory_manager)),
+      _function()
 {
 }
 
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+                                                            enable_fast_math));
 
-    switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
-                                                      weights_info))
+    switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info))
     {
         case ConvolutionMethod::WINOGRAD:
         {
-            auto f = arm_compute::support::cpp14::make_unique<NEWinogradLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info);
+            auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
+            f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::GEMM:
         {
             auto f = arm_compute::support::cpp14::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info, weights_info);
+            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
             _function = std::move(f);
             break;
         }
         case ConvolutionMethod::DIRECT:
         {
             auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
-            f->configure(input, weights, biases, output, conv_info);
+            f->configure(input, weights, biases, output, conv_info, act_info);
             _function = std::move(f);
             break;
         }
@@ -75,21 +78,22 @@
 }
 
 Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info)
+                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
-    switch(NEConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info))
+    switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
     {
         case ConvolutionMethod::WINOGRAD:
             //Validate Winograd
-            NEWinogradLayer::validate(input, weights, biases, output, conv_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
             break;
         case ConvolutionMethod::GEMM:
             //Validate Gemm-based Convolution
-            NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
             break;
         case ConvolutionMethod::DIRECT:
             //Validate Gemm-based Convolution
-            NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+            break;
         default:
             ARM_COMPUTE_ERROR("Not supported.");
             break;
@@ -98,17 +101,20 @@
     return Status{};
 }
 
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+                                                             const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                             const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
 {
-    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
     ARM_COMPUTE_UNUSED(weights_info);
-    if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
-       && (conv_info.stride().second == 1) && (biases != nullptr))
+
+    if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
+       || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
     {
-        return ConvolutionMethod::WINOGRAD;
+        return ConvolutionMethod::GEMM;
     }
-    return ConvolutionMethod::GEMM;
+
+    return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
 }
 
 void NEConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index c1ba5dd..40ada8f 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -34,6 +34,7 @@
 NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
     : _memory_group(std::move(memory_manager)),
       _conv_f(),
+      _upsample_f(),
       _scaled_output(),
       _input(nullptr),
       _info(),
@@ -41,13 +42,64 @@
 {
 }
 
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
+                                      unsigned int inner_border_right, unsigned int inner_border_top)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
+
+    const unsigned int stride_x = info.stride().first;
+    const unsigned int stride_y = info.stride().second;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
+
+    auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
+                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights, bias);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+    }
+
+    if(output->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+        const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+    }
+
+    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
+                                                                                                      info)));
+    const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != scale_out_info.dimension(i));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+
+    return Status{};
+}
+
 void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
                                      unsigned int inner_border_right, unsigned int inner_border_top)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(!info.padding_is_symmetric());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     _input        = input;
     _info         = info;
@@ -55,15 +107,9 @@
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
-    auto               out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
-                                                                  info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
 
-    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
-
-    ARM_COMPUTE_UNUSED(output_shape);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
 
     _memory_group.manage(&_scaled_output);
 
@@ -79,44 +125,20 @@
 
     // Allocate auxiliary tensors
     _scaled_output.allocator()->allocate();
+
+    // configure upsample function
+    _upsample_f.configure(input, &_scaled_output, info, inner_border_right, inner_border_top);
 }
 
 void NEDeconvolutionLayer::run()
 {
     _memory_group.acquire();
 
-    // Initialize _scaled_output buffer
-    const int width_in      = _input->info()->dimension(0);
-    const int height_in     = _input->info()->dimension(1);
-    const int width_scaled  = _scaled_output.info()->dimension(0);
-    const int height_scaled = _scaled_output.info()->dimension(1);
-    const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in);
-    const int stride_x      = _info.stride().first;
-    const int stride_y      = _info.stride().second;
-
-    std::fill_n(_scaled_output.buffer(), _scaled_output.info()->total_size(), 0);
-
-    // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output
-    // and insert rows and columns with zeroes depending on the stride values.
-    for(int slice = 0; slice < num_2d_slices; ++slice)
-    {
-        const int start_x = _info.pad().first;
-        const int start_y = _inner_border.second + _info.pad().second;
-        const int end_y   = height_scaled - _info.pad().second;
-        const int end_x   = width_scaled - _inner_border.first - _info.pad().first;
-
-        for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
-        {
-            for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++)
-            {
-                const auto in = *(reinterpret_cast<float *>(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice))));
-                *(reinterpret_cast<float *>(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in;
-            }
-        }
-    }
+    // Run upsample kernel
+    _upsample_f.run();
 
     // Run convolution layer
     _conv_f.run();
 
     _memory_group.release();
-}
+}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 95fcf88..0a977ad 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -37,11 +37,11 @@
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
     : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
-      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false)
+      _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -52,30 +52,39 @@
     _has_bias     = biases != nullptr;
     _is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->info()->tensor_shape(),
                                                                                           conv_info,
-                                                                                          input->info()->data_type());
+                                                                                          input->info()->data_type(),
+                                                                                          depth_multiplier,
+                                                                                          input->info()->data_layout());
     _are_weights_reshaped = false;
+    _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
+
+    ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
 
     if(_is_optimized)
     {
-        // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+        if(_is_nchw)
+        {
+            // Configure the function to transform the input tensor from NCHW -> NHWC
+            _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
 
-        // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+            // Configure the function to transform the weights tensor from IHW -> HWI
+            _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
 
-        // Configure optimized depthwise
-        _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, DataLayout::NHWC);
+            // Configure optimized depthwise
+            _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
 
-        // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-        _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+            // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
+            _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
 
-        // Allocate tensors
-        _input_nhwc.allocator()->allocate();
-        _weights_hwio.allocator()->allocate();
-        _output_nhwc.allocator()->allocate();
-
-        // Create convolver (deferred)
-        _dwc_kernel.generate_convolver();
+            // Allocate tensors
+            _input_nhwc.allocator()->allocate();
+            _weights_hwio.allocator()->allocate();
+            _output_nhwc.allocator()->allocate();
+        }
+        else
+        {
+            _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+        }
     }
     else
     {
@@ -88,7 +97,7 @@
         }
 
         // Configure depthwise convolution kernel
-        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+        _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
 
         // Configure border handler
         _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
@@ -116,8 +125,15 @@
 
 void NEDepthwiseConvolutionLayer3x3::run()
 {
+    if(_is_first_run && _is_optimized)
+    {
+        _is_first_run = false;
+        // Create convolver (deferred)
+        _dwc_kernel.generate_convolver();
+    }
+
     // Permute weights in HWIO format if the optimized kernel will be executedd
-    if(!_are_weights_reshaped && _is_optimized)
+    if(!_are_weights_reshaped && _is_optimized && _is_nchw)
     {
         _are_weights_reshaped = true;
         _permute_weights.run();
@@ -126,8 +142,11 @@
     // Handle input
     if(_is_optimized)
     {
-        // Permute input to NHWC format execution
-        _permute_input.run();
+        if(_is_nchw)
+        {
+            // Permute input to NHWC format execution
+            _permute_input.run();
+        }
     }
     else
     {
@@ -139,7 +158,7 @@
     NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
 
     // Permute output to ACL's native NCHW format in case of NHWC execution
-    if(_is_optimized)
+    if(_is_optimized && _is_nchw)
     {
         _permute_output.run();
     }
@@ -153,31 +172,37 @@
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_quantized(false)
+      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
 {
 }
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
 
     const size_t weights_w = weights->info()->dimension(0);
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
-    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _is_first_run     = true;
+    _original_weights = weights;
 
     // Should bias be appended ?
     bool append_bias = (biases != nullptr) && !_is_quantized;
 
     // Calculate output shape
-    TensorShape dwc_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+    TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
 
     // Output width and height
-    const unsigned int conv_w = dwc_output_shape.x();
-    const unsigned int conv_h = dwc_output_shape.y();
+    const unsigned int conv_w = output_shape.x();
+    const unsigned int conv_h = output_shape.y();
 
     // Set up intermediate tensors
     const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
@@ -189,7 +214,7 @@
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
     _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
 
     // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
@@ -204,7 +229,7 @@
     shape_v2mm_out.set(2, 1);
     _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
-    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(dwc_output_shape));
+    _output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
     _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
 
     // Output staged configuration
@@ -241,10 +266,21 @@
 
 void NEDepthwiseConvolutionLayer::run()
 {
+    // Run weights reshaping (Runs once for every configure)
+    if(_is_first_run)
+    {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+        NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+        _is_first_run = false;
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
+    }
+
     NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
-    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
-    NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
     NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
     NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
     if(_is_quantized)
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index a58b6e4..0627977 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
@@ -34,8 +35,18 @@
 {
 }
 
+Status NEDequantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *min_max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, min_max);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayerKernel::validate(input, output, min_max));
+
+    return Status{};
+}
+
 void NEDequantizationLayer::configure(const ITensor *input, ITensor *output, const ITensor *min_max)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, min_max);
+
     // Configure kernel
     _dequantize_kernel.configure(input, output, min_max);
 }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index c26c99a..445864c 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,18 +34,23 @@
 using namespace arm_compute;
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false), _is_fixed_point(false)
+    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), _is_fixed_point(false),
+      _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
 {
 }
 
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
+void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
+
     // Free accumulator
     if(_accumulator.buffer() != nullptr)
     {
         _accumulator.allocator()->free();
     }
 
+    _dim_split = input->info()->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
     // Check if bias should be added in the convolution result
     _has_bias = (bias != nullptr);
 
@@ -73,9 +78,17 @@
 
     // Add zero padding XY
     _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -101,6 +114,11 @@
     // Validate bias kernel
     ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
 
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+    }
+
     return Status{};
 }
 
@@ -110,10 +128,15 @@
 
     _memory_group.acquire();
 
-    NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
+    NEScheduler::get().schedule(&_conv_kernel, _dim_split);
     if(_has_bias || _is_fixed_point)
     {
         NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
     }
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 26b7271..958d081 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -132,7 +132,7 @@
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
-      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false)
+      _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
 {
 }
 
@@ -163,6 +163,7 @@
     const int    num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
+    _original_weights     = weights;
     _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
@@ -187,7 +188,7 @@
 
     if(_linearize_input)
     {
-        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input->info(), num_input_dimensions)));
+        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
 
         // Configure im2col kernel
         _memory_group.manage(&_im2col_output);
@@ -287,7 +288,7 @@
 
     if(linearize_input)
     {
-        im2col_output->set_tensor_shape(compute_im2col_shape(input, num_input_dimensions));
+        im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
 
         ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
 
@@ -324,8 +325,13 @@
     // Reshape of the weights (happens only once)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights_kernel.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 05907ba..9168ed4 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,37 +26,20 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
 #include <cmath>
 
 namespace arm_compute
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
       _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
@@ -83,41 +66,14 @@
     _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
 
+    const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
+                               && setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
+
     // Check if the first input tensor is a vector.
     // If so, all the kernels for reshaping the tensors can be skipped
     if(_run_vector_matrix_multiplication)
     {
-#if defined(__aarch64__)
-        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
-        {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
-        }
-
-        if(_mm_optimised_kernel != nullptr)
-        {
-            struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-            const int N = d->info()->tensor_shape().x();
-            const int K = a->info()->tensor_shape().x();
-
-            size_t workbench_size = 0;
-
-            if(a->info()->data_type() == DataType::F32)
-            {
-                workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
-            }
-
-            constexpr size_t alignment = 4096;
-            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
-            _memory_group.manage(&_workspace);
-
-            // Configure matrix multiplication kernel
-            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
-            _workspace.allocator()->allocate();
-        }
-        else
-#endif /* defined(__aarch64__) */
+        if(!run_optimised)
         {
             // Configure the matrix multiply kernel
             _mm_kernel.configure(a, b, d, alpha, false);
@@ -132,65 +88,7 @@
     }
     else
     {
-#if defined(__arm__)
-        if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
-        {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
-        }
-#elif defined(__aarch64__)
-        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
-        {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
-        }
-        else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
-        {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-#if defined(__arm__) || defined(__aarch64__)
-        if(_mm_optimised_kernel != nullptr)
-        {
-            struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-            const int M = d->info()->tensor_shape().y();
-            const int N = d->info()->tensor_shape().x();
-            const int K = a->info()->tensor_shape().x();
-
-            size_t workbench_size = 0;
-
-#if defined(__arm__)
-            workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
-#elif defined(__aarch64__)
-            if(a->info()->data_type() == DataType::F32)
-            {
-                workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-            }
-            else if(a->info()->data_type() == DataType::F16)
-            {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-                ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-            }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-            constexpr size_t alignment = 4096;
-            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
-            _memory_group.manage(&_workspace);
-
-            // Configure matrix multiplication kernel
-            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
-            _workspace.allocator()->allocate();
-        }
-        else
-#endif /* defined(__arm__) || defined(__aarch64__) */
+        if(!run_optimised)
         {
             TensorShape shape_tmp_a = a->info()->tensor_shape();
             TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -210,7 +108,10 @@
 
             // Manage intermediate buffers
             _memory_group.manage(&_tmp_a);
-            _memory_group.manage(&_tmp_b);
+            if(!_reshape_b_only_on_first_run)
+            {
+                _memory_group.manage(&_tmp_b);
+            }
 
             int m = a->info()->dimension(1);
             int n = b->info()->dimension(0);
@@ -243,9 +144,9 @@
 {
     _memory_group.acquire();
 
-    if(_mm_optimised_kernel != nullptr)
+    if(_asm_glue._optimised_kernel != nullptr)
     {
-        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+        _asm_glue.run();
         _memory_group.release();
     }
     else
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a85078c..2888b43 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,9 +23,6 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
 
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
@@ -34,13 +31,6 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
 #include <cmath>
 #include <tuple>
 
@@ -175,19 +165,28 @@
     }
 }
 
-Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
-                                      bool &append_bias,
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                      const ActivationLayerInfo &act_info, DataType &dt,
+                                      bool &append_bias, bool &skip_im2col,
                                       bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height,
-                                      bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized,
+                                      bool &is_fully_connected_convolution, bool &is_interleaved, bool &is_quantized, bool &is_activationlayer_enabled,
                                       unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
-                                      unsigned int &conv_w, unsigned int &conv_h)
+                                      unsigned int &conv_w, unsigned int &conv_h, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    DataLayout data_layout = input->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(idx_channel) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32, "NHWC is only supported for FP32 data type.");
 
     dt           = input->data_type();
     is_quantized = is_data_type_quantized_asymmetric(dt);
@@ -207,28 +206,32 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
+    // If we have 1x1 convolution and data layout is NHWC we can disable im2col
     append_bias          = (biases != nullptr) && (!is_quantized);
     are_weights_reshaped = weights_info.are_reshaped();
-    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
-    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+    kernel_width         = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(idx_width);
+    kernel_height        = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(idx_height);
     mat_weights_cols     = weights->dimension(3);
-    mat_weights_rows     = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (append_bias ? 1 : 0);
+    mat_weights_rows     = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + ((append_bias && !skip_im2col) ? 1 : 0);
+    skip_im2col          = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1);
 
-    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
-                                                 conv_info);
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width), input->dimension(idx_height), kernel_width, kernel_height,
+                                                 conv_info, dilation);
 
     // Check if its a "fully connected" convolution
     is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
     is_interleaved                 = (!is_fully_connected_convolution && !is_quantized);
+    is_activationlayer_enabled     = act_info.enabled();
 
     return Status{};
 }
 } // namespace
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager),
-      _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
-      _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
+    : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+      _output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
+      _tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
+      _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
 {
 }
 
@@ -256,26 +259,8 @@
     }
 }
 
-void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
-{
-    ARM_COMPUTE_UNUSED(ci);
-    ARM_COMPUTE_UNUSED(M);
-    ARM_COMPUTE_UNUSED(N);
-    ARM_COMPUTE_UNUSED(K);
-#if defined(__arm__) || defined(__aarch64__)
-#if defined(__arm__)
-    GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
-    GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-    constexpr size_t alignment = 4096;
-    _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-    _memory_group.manage(&_workspace);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-}
-
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+                                       const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -288,45 +273,35 @@
     unsigned int conv_w           = 0;
     unsigned int conv_h           = 0;
 
-    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _append_bias, _are_weights_reshaped,
+    _data_layout           = input->info()->data_layout();
+    const bool is_nhwc     = _data_layout == DataLayout::NHWC;
+    const int  idx_width   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int  idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+    Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, act_info, dt, _append_bias, _skip_im2col,
+                                                   _are_weights_reshaped,
                                                    kernel_width, kernel_height,
-                                                   _is_fully_connected_convolution, _is_interleaved, _is_quantized,
-                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h);
+                                                   _is_fully_connected_convolution, _is_interleaved, _is_quantized, _is_activationlayer_enabled,
+                                                   mat_weights_cols, mat_weights_rows, conv_w, conv_h, dilation);
 
     ARM_COMPUTE_ERROR_THROW_ON(status);
 
+    _original_weights                       = weights;
     const unsigned int fixed_point_position = input->info()->fixed_point_position();
     const ITensor     *biases_to_use        = (_append_bias) ? biases : nullptr;
 
-#if defined(__arm__)
-    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
-    {
-        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
-    }
-#elif defined(__aarch64__)
-    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
-    {
-        _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
-    }
-#endif /* defined(__arm__) || defined(__aarch64__) */
+    bool run_optimised = dt == DataType::F32;
 
     // Reshape weights if needed
-    if(_mm_optimised_kernel != nullptr)
+    if(run_optimised)
     {
-        if(_are_weights_reshaped)
-        {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->info()->dimension(1);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+        TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
 
-            // Create tensor to store the reshaped weights
-            _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
-            weights = &_weights_reshaped;
-        }
+        // Create tensor to store the reshaped weights
+        _weights_reshaped.allocator()->init(TensorInfo(reshaped_weights_shape, 1, dt, fixed_point_position));
+        _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+        weights = &_weights_reshaped;
     }
     else
     {
@@ -335,12 +310,12 @@
             if(_is_fully_connected_convolution || _is_quantized)
             {
                 mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights->info()->dimension(1);
+                mat_weights_rows = weights->info()->dimension(idx_height);
             }
             else
             {
                 mat_weights_cols = weights_info.num_kernels();
-                mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(2) + (_append_bias ? 1 : 0);
+                mat_weights_rows = weights_info.kernel_size().first * weights_info.kernel_size().second * input->info()->dimension(idx_channel) + (_append_bias ? 1 : 0);
             }
         }
         else
@@ -366,66 +341,56 @@
         }
     }
 
-    // Create tensor to store im2col reshaped inputs
-    const unsigned int mat_input_cols = mat_weights_rows;
-    const unsigned int mat_input_rows = conv_w * conv_h;
-
-    TensorShape shape_im2col(input->info()->tensor_shape());
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-    _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
-    _memory_group.manage(&_input_im2col_reshaped);
-
-    // Create tensor (interleave) to prepare input tensor for GEMM
-    if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
+    // In case we skip im2col we have to add bias
+    if(!_skip_im2col)
     {
-        TensorShape shape_interleaved(shape_im2col);
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
-        _memory_group.manage(&_input_interleaved_reshaped);
+        const unsigned int mat_input_cols = mat_weights_rows;
+        const unsigned int mat_input_rows = conv_w * conv_h;
+
+        // Create tensor to store im2col reshaped inputs
+        TensorShape shape_im2col(input->info()->tensor_shape());
+        shape_im2col.set(0, mat_input_cols);
+        shape_im2col.set(1, mat_input_rows);
+        shape_im2col.set(2, 1);
+        _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+        _memory_group.manage(&_input_im2col_reshaped);
+
+        // Create tensor (interleave) to prepare input tensor for GEMM
+        if(!_is_fully_connected_convolution && !run_optimised && _is_interleaved)
+        {
+            TensorShape shape_interleaved(shape_im2col);
+            shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+            shape_interleaved.set(idx_height, std::ceil(shape_interleaved[idx_height] / 4.f));
+            _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
+            _memory_group.manage(&_input_interleaved_reshaped);
+        }
+
+        // Create GEMM output tensor
+        TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, mat_input_rows);
+        const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+        info_gemm.set_quantization_info(output->info()->quantization_info());
+        _gemm_output.allocator()->init(info_gemm);
+
+        // Configure im2col
+        _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
+    }
+    else if(_append_bias)
+    {
+        // Configure add bias kernel
+        _add_bias_kernel.configure(output, biases, output, ConvertPolicy::SATURATE);
     }
 
-    // Create GEMM output tensor
-    TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
-    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
-    // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
-    info_gemm.set_quantization_info(output->info()->quantization_info());
-    _gemm_output.allocator()->init(info_gemm);
-    _memory_group.manage(&_gemm_output);
-
-    // Configure kernels
-    // Configure im2col
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
-
     // Configure matrix multiply
-    if(_mm_optimised_kernel != nullptr)
+    if(run_optimised)
     {
-        struct CPUInfo ci = NEScheduler::get().cpu_info();
-
-        const int M = _gemm_output.info()->tensor_shape().y();
-        const int N = _gemm_output.info()->tensor_shape().x();
-        const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-
-#if defined(__aarch64__)
-        if((N <= 128) && (K <= 128))
+        if(!setup_assembly_kernel(_skip_im2col ? input : &_input_im2col_reshaped, weights, is_nhwc ? output : &_gemm_output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue))
         {
-            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
+            ARM_COMPUTE_ERROR("setup_assembly_kernel failed.");
         }
-        else
-#endif /* defined(__aarch64__) */
-        {
-            configure_asm_mm(ci, M, N, K);
-        }
-
-        // Configure matrix multiplication kernel
-        _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
-
-        _workspace.allocator()->allocate();
     }
     else
     {
@@ -435,8 +400,8 @@
             _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
 
             // Configure GEMM
-            configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(1), 0 /* no transpose */,
-                                                                                                                _input_im2col_reshaped.info()->dimension(0)));
+            configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output, _is_interleaved, GEMMReshapeInfo(_input_im2col_reshaped.info()->dimension(idx_height), 0 /* no transpose */,
+                                                                                                                _input_im2col_reshaped.info()->dimension(idx_width)));
             _input_interleaved_reshaped.allocator()->allocate();
         }
         else
@@ -445,48 +410,63 @@
         }
     }
 
-    _input_im2col_reshaped.allocator()->allocate();
-
-    // Configure output stage for quantized case
-    if(_is_quantized)
+    if(!_skip_im2col)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+        _input_im2col_reshaped.allocator()->allocate();
 
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _memory_group.manage(&_tmp_output);
-        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+        // Configure output stage for quantized case
+        if(_is_quantized)
+        {
+            const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+            float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+            int   output_multiplier, output_shift;
+            quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+            _memory_group.manage(&_tmp_output);
+            _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output_quant_info.offset);
+        }
+
+        // Configure Col2Im
+        if(!is_nhwc)
+        {
+            _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
+        }
+
+        if(_is_quantized)
+        {
+            _tmp_output.allocator()->allocate();
+        }
+        _gemm_output.allocator()->allocate();
     }
 
-    // Configure Col2Im
-    _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, Size2D(conv_w, conv_h));
-    if(_is_quantized)
-    {
-        _tmp_output.allocator()->allocate();
-    }
-    _gemm_output.allocator()->allocate();
-
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
 
     // Allocate intermediate tensor
     if(!_are_weights_reshaped)
     {
         _weights_reshaped.allocator()->allocate();
     }
+
+    //Configure Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
 Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info)
+                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(output);
 
     DataType     dt{};
     bool         append_bias{};
+    bool         skip_im2col{};
     bool         are_weights_reshaped{};
     bool         is_fully_connected_convolution{};
     bool         is_interleaved{};
     bool         is_quantized{};
+    bool         is_activationlayer_enabled{};
     unsigned int kernel_width     = 0;
     unsigned int kernel_height    = 0;
     unsigned int mat_weights_cols = 0;
@@ -494,9 +474,14 @@
     unsigned int conv_w           = 0;
     unsigned int conv_h           = 0;
 
-    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, append_bias, are_weights_reshaped, kernel_width, kernel_height,
-                                                   is_fully_connected_convolution, is_interleaved, is_quantized, mat_weights_cols, mat_weights_rows,
-                                                   conv_w, conv_h);
+    const DataLayout data_layout = input->data_layout();
+    const bool       is_nhwc     = data_layout == DataLayout::NHWC;
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, act_info, dt, append_bias, skip_im2col, are_weights_reshaped, kernel_width, kernel_height,
+                                                   is_fully_connected_convolution, is_interleaved, is_quantized, is_activationlayer_enabled, mat_weights_cols, mat_weights_rows,
+                                                   conv_w, conv_h, dilation);
 
     const Size2D kernel_weights = Size2D(kernel_width, kernel_height);
 
@@ -505,68 +490,11 @@
     std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
     bool                         optimised_kernel = false;
 
-#if defined(__arm__)
-    if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+    if(dt == DataType::F32)
     {
         optimised_kernel = true;
     }
-#elif defined(__aarch64__)
-    if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
-    {
-        optimised_kernel = true;
-    }
-#endif /* defined(__arm__) || defined(__aarch64__) */
 
-    // Reshape weights if needed
-    if(optimised_kernel)
-    {
-        if(are_weights_reshaped)
-        {
-            mat_weights_cols = weights_info.num_kernels();
-            mat_weights_rows = weights->dimension(1);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
-
-            // Create tensor to store the reshaped weights
-            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-            weights = reshaped_weights.get();
-        }
-    }
-    else
-    {
-        if(are_weights_reshaped)
-        {
-            const unsigned int transpose_width = 16 / input->element_size();
-            mat_weights_cols                   = weights_info.num_kernels();
-            mat_weights_rows                   = weights->dimension(0) / transpose_width + (append_bias ? 1 : 0);
-        }
-        else
-        {
-            TensorShape reshaped_weights_shape;
-
-            if(is_fully_connected_convolution || is_quantized)
-            {
-                reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
-            }
-            else
-            {
-                // Create tensor to store transposed weights
-                const float transpose_width = 16.0f / input->element_size();
-                reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
-                                                           static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
-            }
-
-            // Create tensor to store the reshaped weights
-            reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
-            weights = reshaped_weights.get();
-        }
-    }
-
-    // Validate im2col
     const unsigned int mat_input_cols = mat_weights_rows;
     const unsigned int mat_input_rows = conv_w * conv_h;
     TensorShape        shape_im2col   = input->tensor_shape();
@@ -574,7 +502,17 @@
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
     TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false));
+
+    if(!skip_im2col)
+    {
+        // Validate im2col
+        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, kernel_weights, conv_info, append_bias, false, false, dilation));
+    }
+    else if(append_bias)
+    {
+        // Validate add bias kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output, biases, output, ConvertPolicy::SATURATE));
+    }
 
     // Create GEMM output tensor
     TensorShape shape_gemm(im2_col_info.tensor_shape());
@@ -582,19 +520,63 @@
     shape_gemm.set(1, mat_input_rows);
     TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
 
-    // Validate GEMM interleave and multiply
-    if(is_interleaved)
+    // Reshape weights if needed
+    if(optimised_kernel)
     {
-        TensorShape shape_interleaved = shape_im2col;
-        shape_interleaved.set(0, shape_interleaved.x() * 4);
-        shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+        ARM_COMPUTE_RETURN_ERROR_ON(are_weights_reshaped);
+
+        // Create tensor to store the reshaped weights
+        reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
     }
-    else
+    else if(!is_quantized)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+        TensorShape reshaped_weights_shape;
+
+        if(is_fully_connected_convolution || is_quantized)
+        {
+            reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+        }
+        else
+        {
+            // Create tensor to store transposed weights
+            const float transpose_width = 16.0f / input->element_size();
+            reshaped_weights_shape      = TensorShape{ mat_weights_rows *static_cast<unsigned int>(transpose_width),
+                                                       static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+        }
+
+        // Create tensor to store the reshaped weights
+        reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, append_bias, is_fully_connected_convolution));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+        weights = reshaped_weights.get();
+
+        // Validate GEMM interleave and multiply
+        if(is_interleaved)
+        {
+            TensorShape shape_interleaved = shape_im2col;
+            shape_interleaved.set(idx_width, shape_interleaved.x() * 4);
+            shape_interleaved.set(idx_height, std::ceil(shape_interleaved.y() / 4.f));
+            TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo(shape_im2col[1],            // m
+                                                                             weights->tensor_shape()[0], // n
+                                                                             shape_im2col[0]) /* k */));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info, 1.f, is_interleaved, GEMMReshapeInfo()));
+        }
+    }
+    if(!is_nhwc)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(idx_width) != conv_w) || (output->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
+
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
     }
 
     return Status{};
@@ -605,19 +587,33 @@
     // Run weights reshaping (Runs once for every configure)
     if(!_are_weights_reshaped)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _are_weights_reshaped = true;
         _reshape_weights.run();
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
 
-    // Run input reshaping
-    NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
+    if(!_skip_im2col)
+    {
+        // Run input reshaping
+        unsigned int _y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+        NEScheduler::get().schedule(&_input_im2col_kernel, _y_dim);
+    }
 
     // Runs matrix multiply on reshaped matrices
-    if(_mm_optimised_kernel != nullptr)
+    if(_asm_glue._optimised_kernel != nullptr)
     {
-        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+        _asm_glue.run();
+        // Release weights in case buffer is pretransposed
+        if(!_weights_reshaped.is_used())
+        {
+            _weights_reshaped.allocator()->free();
+        }
     }
     else
     {
@@ -638,6 +634,11 @@
         }
     }
 
+    if(_skip_im2col && _append_bias)
+    {
+        NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
+    }
+
     // Run output stage for quantized case
     if(_is_quantized)
     {
@@ -645,7 +646,15 @@
     }
 
     // Reshape output matrix
-    NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+    if(_data_layout == DataLayout::NCHW)
+    {
+        NEScheduler::get().schedule(&_output_col2im_kernel, Window::DimY);
+    }
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 9b36e81..98b4767 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017 ARM Limited.
+/* Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,20 +35,11 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
 using namespace arm_compute;
 
 NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
+      _workspace(), _B_pretransposed()
 {
 }
 
@@ -65,89 +52,29 @@
     ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
     ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
 
+    bool run_optimised = false;
 #ifdef __aarch64__
-    const int            M                   = output->info()->tensor_shape().y();
-    const int            N                   = output->info()->tensor_shape().x();
-    const int            K                   = a->info()->tensor_shape().x();
-    constexpr size_t     workspace_alignment = 4096;
-    const struct CPUInfo ci                  = NEScheduler::get().cpu_info();
+    switch(a->info()->data_type())
+    {
+        case DataType::S8:
+        {
+            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_signed);
+            break;
+        }
+        case DataType::QASYMM8:
+        case DataType::U8:
+        {
+            run_optimised = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretransposed, _memory_group, _asm_glue_unsigned);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Datatype not supported");
+            break;
+        }
+    }
 #endif /* __aarch64__ */
-
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
-    {
-        // Configure matrix multiply kernel
-        GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else
-#elif defined(ARM_COMPUTE_AARCH64_V8A)
-    if(ci.CPU == CPUTarget::A53)
-    {
-        switch(a->info()->data_type())
-        {
-            case DataType::S8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            case DataType::U8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Datatype not supported");
-        }
-
-        _memory_group.manage(&_workspace);
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else if(1) // Generic v8a kernel
-    {
-        switch(a->info()->data_type())
-        {
-            case DataType::S8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            case DataType::U8:
-            {
-                // Configure matrix multiply kernel
-                GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
-                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-            }
-            break;
-            default:
-                ARM_COMPUTE_ERROR("Datatype not supported");
-        }
-        _memory_group.manage(&_workspace);
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f);
-        _mm_kernel = std::move(k);
-        _workspace.allocator()->allocate();
-    }
-    else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    if(!run_optimised)
     {
         // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
         TensorShape shape_tmp_a = a->info()->tensor_shape();
@@ -206,7 +133,18 @@
         NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
     }
 
-    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    {
+        _asm_glue_unsigned.run();
+    }
+    else if(_asm_glue_signed._optimised_kernel != nullptr)
+    {
+        _asm_glue_signed.run();
+    }
+    else
+    {
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    }
 
     _memory_group.release();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index ad47593..2e06fa2 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,11 +26,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,58 +37,48 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "support/ToolchainSupport.h"
 
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
 
 NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
-      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+    : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+      _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
+      _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
 {
 }
 
 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-    ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
 
     _a_offset                         = a->info()->quantization_info().offset;
     _b_offset                         = b->info()->quantization_info().offset;
     _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+    _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
 
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    // Check for DOT product instruction
-    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
-    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
-    if(cpu_has_dotprod != 0)
+#ifdef __aarch64__
+    switch(a->info()->data_type())
     {
-        _dot_product_path = true;
-
-        // Configure matrix multiply kernel
-        struct CPUInfo ci = NEScheduler::get().cpu_info();
-        const int      M  = output->info()->tensor_shape().y();
-        const int      N  = output->info()->tensor_shape().x();
-        const int      K  = a->info()->tensor_shape().x();
-
-        const size_t     workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-        constexpr size_t alignment      = 4096;
-        _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
-        _memory_group.manage(&_workspace);
-
-        // Configure matrix multiplication kernel
-        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
-        k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
-        _mm_kernel = std::move(k);
+        case DataType::S8:
+        {
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
+            break;
+        }
+        case DataType::QASYMM8:
+        case DataType::U8:
+        {
+            _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Datatype not supported");
+            break;
+        }
     }
-    else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+    if(!_dot_product_path)
     {
         if(_run_vector_matrix_multiplication)
         {
@@ -110,7 +98,10 @@
             _tmp_a.allocator()->init(info_a);
             _tmp_b.allocator()->init(info_b);
             _memory_group.manage(&_tmp_a);
-            _memory_group.manage(&_tmp_b);
+            if(!_reshape_b_only_on_first_run)
+            {
+                _memory_group.manage(&_tmp_b);
+            }
 
             // Configure interleave kernel
             {
@@ -141,7 +132,10 @@
         TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
 
         _vector_sum_col.allocator()->init(info_vector_sum_col);
-        _memory_group.manage(&_vector_sum_col);
+        if(!_reshape_b_only_on_first_run)
+        {
+            _memory_group.manage(&_vector_sum_col);
+        }
 
         // Configure Matrix B reduction kernel
         _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
@@ -168,10 +162,6 @@
         _tmp_a.allocator()->allocate();
         _tmp_b.allocator()->allocate();
     }
-    else
-    {
-        _workspace.allocator()->allocate();
-    }
 
     if(_a_offset != 0)
     {
@@ -203,42 +193,28 @@
     int32_t b_offset                         = b->quantization_info().offset;
     bool    run_vector_matrix_multiplication = a->dimension(1) < 2;
 
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-    // Check for DOT product instruction
-    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
-    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
-    if(cpu_has_dotprod != 0)
+    if(!run_vector_matrix_multiplication)
     {
-        // Validate matrix multiply kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->tensor_shape();
+        shape_tmp_a.set(0, a->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->tensor_shape();
+        shape_tmp_b.set(0, b->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
     }
     else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
     {
-        if(!run_vector_matrix_multiplication)
-        {
-            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-            TensorShape shape_tmp_a = a->tensor_shape();
-            shape_tmp_a.set(0, a->dimension(0) * 4);
-            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
-            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-            TensorShape shape_tmp_b = b->tensor_shape();
-            shape_tmp_b.set(0, b->dimension(1) * 16);
-            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
-            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
-        }
-        else
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -282,13 +258,24 @@
             NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
         }
 
-        if(_mtx_b_reshape_kernel)
+        if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
         {
             NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
         }
     }
 
-    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    if(_asm_glue_unsigned._optimised_kernel != nullptr)
+    {
+        _asm_glue_unsigned.run();
+    }
+    else if(_asm_glue_signed._optimised_kernel != nullptr)
+    {
+        _asm_glue_signed.run();
+    }
+    else
+    {
+        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+    }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
     if(_b_offset != 0)
@@ -297,7 +284,7 @@
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0)
+    if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
     {
         NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
     }
@@ -306,4 +293,6 @@
     NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
 
     _memory_group.release();
+
+    _is_first_run = false;
 }
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index b962db9..6b95cb0 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -23,19 +23,30 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
 
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+NEIm2Col::NEIm2Col()
+    : _kernel(), _y_dim(1)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
-    _kernel = std::move(k);
 }
 
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
+    _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+{
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+void NEIm2Col::run()
+{
+    NEScheduler::get().schedule(&_kernel, _y_dim);
 }
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index fa62483..d0b80fb 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,6 +46,26 @@
     _sumsq.allocator()->allocate();
 }
 
+Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, float epsilon)
+{
+    TensorShape shape(input->tensor_shape());
+
+    // Create intermediate tensor info
+    TensorInfo sum_sq;
+    sum_sq.set_data_type(input->data_type());
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
+
+    // Reduce shape on axis (supported axis is 0)
+    shape.set(0, 1);
+    sum_sq.set_tensor_shape(shape);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
+
+    return Status{};
+}
+
 void NEL2NormalizeLayer::run()
 {
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 45ddb70..913acf8 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -33,39 +33,102 @@
 
 using namespace arm_compute;
 
+namespace
+{
+void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                      TensorShape &shape_wr, TensorShape &shape_im2col, TensorShape &shape_gemm)
+{
+    ARM_COMPUTE_UNUSED(output);
+
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+
+    bool has_bias = (biases != nullptr);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    const size_t mat_weights_cols = weights->dimension(3);
+    const size_t mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + ((has_bias) ? 1 : 0);
+    const size_t mat_weights_num  = weights->dimension(4);
+
+    shape_wr = TensorShape(mat_weights_cols, mat_weights_rows, mat_weights_num);
+
+    const size_t mat_input_cols = mat_weights_rows;
+    const size_t mat_input_rows = conv_w * conv_h;
+
+    shape_im2col = input->tensor_shape();
+    shape_im2col.set(0, mat_input_cols);
+    shape_im2col.set(1, mat_input_rows);
+    shape_im2col.set(2, 1);
+
+    shape_gemm = shape_im2col;
+    shape_gemm.set(0, mat_weights_cols);
+    shape_gemm.set(1, mat_input_rows);
+}
+} // namespace
+
 NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
-      _is_first_run(false)
+      _is_first_run(false), _original_weights(nullptr)
 {
 }
 
+Status NELocallyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(!conv_info.padding_is_symmetric());
+
+    bool has_bias = (biases != nullptr);
+
+    if(has_bias)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 2);
+    }
+
+    const unsigned int kernel_width  = weights->dimension(0);
+    const unsigned int kernel_height = weights->dimension(1);
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+                                                 conv_info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
+
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input, weights, biases, output, conv_info, shape_wr, shape_im2col, shape_gemm);
+
+    TensorInfo weights_reshaped_info(shape_wr, 1, weights->data_type());
+    TensorInfo input_im2col_reshaped_info(shape_im2col, 1, input->data_type());
+    TensorInfo gemm_output_info(shape_gemm, 1, input->data_type());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias, false));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+
+    return Status{};
+}
+
 void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 2);
-    }
-
-    bool _has_bias = (biases != nullptr);
-    _is_first_run  = true;
-
-    // Get parameters for conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
+    bool _has_bias    = (biases != nullptr);
+    _is_first_run     = true;
+    _original_weights = weights;
 
     const unsigned int kernel_width  = weights->info()->dimension(0);
     const unsigned int kernel_height = weights->info()->dimension(1);
@@ -76,32 +139,14 @@
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
                                                  conv_info);
 
-    ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(4) != (conv_w * conv_h), "Weights shape does not match the expected one");
-
-    // Create tensor to store the reshaped weights
-    const size_t mat_weights_cols = weights->info()->dimension(3);
-    const size_t mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + ((_has_bias) ? 1 : 0);
-    const size_t mat_weights_num  = weights->info()->dimension(4);
-
-    const TensorShape shape_wr(mat_weights_cols, mat_weights_rows, mat_weights_num);
+    // Calculate intermediate buffer shapes
+    TensorShape shape_wr;
+    TensorShape shape_im2col;
+    TensorShape shape_gemm;
+    calculate_shapes(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info, shape_wr, shape_im2col, shape_gemm);
 
     _weights_reshaped.allocator()->init(TensorInfo(shape_wr, 1, weights->info()->data_type()));
-
-    // Create tensor to store im2col reshaped inputs
-    const size_t mat_input_cols = mat_weights_rows;
-    const size_t mat_input_rows = conv_w * conv_h;
-    TensorShape  shape_im2col   = input->info()->tensor_shape();
-    shape_im2col.set(0, mat_input_cols);
-    shape_im2col.set(1, mat_input_rows);
-    shape_im2col.set(2, 1);
-
     _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, input->info()->data_type()));
-
-    // Create locally connected layer output tensor
-    TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
-    shape_gemm.set(0, mat_weights_cols);
-    shape_gemm.set(1, mat_input_rows);
     _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, input->info()->data_type()));
 
     // Manage intermediate buffers
@@ -125,8 +170,13 @@
     // Run weights reshaping (Runs once for every configure)
     if(_is_first_run)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
         _is_first_run = false;
         NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+
+        // Mark original weights tensor as unused
+        _original_weights->mark_as_unused();
     }
 
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 5a474e4..cf6b984 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "support/ToolchainSupport.h"
 
@@ -30,11 +31,21 @@
 
 using namespace arm_compute;
 
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
 }
 Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index bc0b6f8..cbfd684 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -31,7 +31,7 @@
 using namespace arm_compute;
 
 NEPoolingLayer::NEPoolingLayer()
-    : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false)
+    : _pooling_layer_kernel(), _border_handler(), _is_global_pooling_layer(false), _data_layout(DataLayout::NCHW)
 {
 }
 
@@ -40,17 +40,31 @@
     // Check if we have Global Pooling Layer
     _is_global_pooling_layer = (input->info()->dimension(0) == pool_info.pool_size().width) && (input->info()->dimension(1) == pool_info.pool_size().height);
 
+    // Get data layout
+    _data_layout = input->info()->data_layout();
+
     // Configure pooling kernel
     _pooling_layer_kernel.configure(input, output, pool_info);
 
-    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    PixelValue zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+    switch(_data_layout)
     {
-        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+        case DataLayout::NCHW:
+        {
+            // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+            BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            PixelValue zero_value(0.f);
+            if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+            {
+                zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+            }
+            _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
+            break;
+        }
+        case DataLayout::NHWC:
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
     }
-    _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
 }
 
 Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
@@ -60,9 +74,20 @@
 
 void NEPoolingLayer::run()
 {
-    // Fill border
-    NEScheduler::get().schedule(&_border_handler, Window::DimY);
+    switch(_data_layout)
+    {
+        case DataLayout::NCHW:
+            // Fill border
+            NEScheduler::get().schedule(&_border_handler, Window::DimY);
 
-    // Run pooling layer
-    NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+            // Run pooling layer
+            NEScheduler::get().schedule(&_pooling_layer_kernel, _is_global_pooling_layer ? Window::DimZ : Window::DimY);
+            break;
+        case DataLayout::NHWC:
+            // Run pooling layer
+            NEScheduler::get().schedule(&_pooling_layer_kernel, Window::DimX);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index a131c48..8f7db96 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
@@ -34,8 +35,21 @@
 {
 }
 
+Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    TensorInfo min_max{ input->num_channels(), input->data_type() };
+    ARM_COMPUTE_RETURN_ON_ERROR(NEMinMaxLayerKernel::validate(input, &min_max));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output, &min_max));
+
+    return Status{};
+}
+
 void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
     // Configure min-max kernel. _min_max tensor will be auto-configured within the kernel
     _min_max_kernel.configure(input, &_min_max);
 
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index f1a9145..cd0b42f 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,13 @@
 {
 }
 
+Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, axis, op));
+
+    return Status{};
+}
+
 void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bd565c9..a9c85bd 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,6 @@
 void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
-    ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
     ARM_COMPUTE_UNUSED(sampling_policy);
 
     Window win;
@@ -66,7 +65,7 @@
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size);
             *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
             *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
         },
@@ -99,20 +98,20 @@
 
 void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy));
 
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
+    // Get data layout and width/height indices
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Get the tensor shape
-    const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+    const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
 
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
-    const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+    const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
+    const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
 
     // Get the element size of the input image
     const size_t input_element_size = input->info()->element_size();
@@ -123,9 +122,6 @@
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
-    // Check if the border mode is UNDEFINED
-    const bool border_undefined = border_mode == BorderMode::UNDEFINED;
-
     switch(policy)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -133,7 +129,7 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -151,7 +147,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -164,7 +160,7 @@
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode);
             break;
         }
         default:
@@ -174,6 +170,48 @@
     _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
 }
 
+Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
+                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
+
+    ITensorInfo *offsets = nullptr;
+    ITensorInfo *dx      = nullptr;
+    ITensorInfo *dy      = nullptr;
+
+    // Get data layout and width/height indices
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Get the tensor shape of auxilary buffers
+    const TensorShape shape(output->dimension(idx_width), output->dimension(idx_height));
+
+    TensorInfo tensor_info_offsets(shape, Format::S32);
+    TensorInfo tensor_info_dx(shape, Format::F32);
+    TensorInfo tensor_info_dy(shape, Format::F32);
+
+    switch(policy)
+    {
+        case InterpolationPolicy::NEAREST_NEIGHBOR:
+            offsets = &tensor_info_offsets;
+            break;
+        case InterpolationPolicy::BILINEAR:
+            offsets = &tensor_info_offsets;
+            dx      = &tensor_info_dx;
+            dy      = &tensor_info_dy;
+            break;
+        default:
+            break;
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
+                                                        policy, border_mode, sampling_policy));
+    return Status{};
+}
+
 void NEScale::run()
 {
     NEScheduler::get().schedule(&_border_handler, Window::DimZ);
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000..8f2c4c4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+
+namespace arm_compute
+{
+namespace
+{
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        in_width    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+    const int        in_height   = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+    const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    const int        in_batches  = input->info()->dimension(3);
+
+    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    const DataLayout   data_layout = input->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    return Status{};
+}
+
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+    Size2D output_tile = Size2D{};
+
+    if(kernel_dims == Size2D(3U, 3U))
+    {
+        output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+    }
+    else if(kernel_dims == Size2D(5U, 5U))
+    {
+        output_tile = Size2D(2U, 2U);
+    }
+
+    return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+    // Check if we want to configure a Winograd configuration which requires fast math
+    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    std::vector<WinogradConfiguration> fast_math_winograd =
+    {
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+    };
+
+    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+                            std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+} //namespace
+
+NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
+      _activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
+      _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+{
+} /* arm_compute */
+
+void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+                                           bool enable_fast_math)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
+
+    // Get indices for the width and height
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const Size2D input_dims  = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
+    const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    _weights = weights;
+    _input   = input;
+    _output  = output;
+
+    std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
+    std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
+    std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
+
+    int n_gemms = 0;
+    int N_BLOCK = 0; // Size of block used by GEMM.
+
+    switch(kernel_size.width)
+    {
+        case 3:
+        {
+            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
+            {
+                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
+                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
+                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
+                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
+                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+            }
+            else
+            {
+                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
+                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
+                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+            }
+            break;
+        }
+        case 5:
+        {
+            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
+            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
+            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
+            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
+            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+        }
+    }
+
+    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+    const bool        use_same_padding = use_padding_type == PADDING_SAME;
+
+    // Get convolved dimensions
+    const int in_channels  = input->info()->dimension(channel_idx);
+    const int out_channels = output->info()->dimension(channel_idx);
+
+    const Tensor4DShape in_shape(internal_get_input_shape(input));
+    const size_t        data_type_size = input->info()->element_size();
+    // Get the memory required to instantiate a new Winograd operator.
+    constexpr size_t storage_alignment   = 64;
+    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _kernel_storage.allocator()->allocate();
+    // Input storage
+    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
+    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _input_workspace.allocator()->allocate();
+
+    // Output storage
+    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
+    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+    _output_workspace.allocator()->allocate();
+
+    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
+    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+                                _output->info()->dimension(1), _output->info()->dimension(3)),
+                    1, _output->info()->data_type());
+    _output_nhwc.allocator()->init(info);
+    _output_nhwc.allocator()->allocate();
+
+    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+    _weights_hwio.allocator()->allocate();
+
+    // configure the kernel to transform the input tensor from NCHW -> NHWC
+    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+    _input_nhwc.allocator()->allocate();
+
+    const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
+
+    // Configure the InputTransform
+    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
+
+    // Configure WeightsTransform
+    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
+    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
+
+    // Configure OutputTransform
+    //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
+    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
+    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
+
+    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
+                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
+                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
+
+    // Configure GEMM
+    const int    tile_rows                = iceildiv(output_shape.n_rows, output_tile.height);
+    const int    tile_cols                = iceildiv(output_shape.n_cols, output_tile.width);
+    const int    m                        = in_shape.n_batches * tile_rows * tile_cols;
+    const int    k                        = in_shape.n_channels;
+    const int    n                        = out_channels;
+    const int    input_matrix_row_stride  = in_shape.n_channels;
+    const int    kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+    const int    output_matrix_row_stride = kernel_matrix_row_stride;
+    unsigned int num_threads              = NEScheduler::get().num_threads();
+
+    _arm_gemm = arm_gemm::gemm<float, float>(NEScheduler::get().cpu_info(), m, n, k, 1, n_gemms, false, false, 1.f, 0.f, num_threads, false);
+    _arm_gemm->set_arrays(reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_row_stride, 0, input_matrix_stride, reinterpret_cast<float *>(_kernel_storage.buffer()),
+                          kernel_matrix_row_stride, kernel_matrix_stride, reinterpret_cast<float *>(_output_workspace.buffer()), output_matrix_row_stride, 0, output_matrix_stride);
+
+    auto acl_gemm_wrapper = support::cpp14::make_unique<NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>>>();
+    acl_gemm_wrapper->configure(_arm_gemm.get());
+    const size_t workspace_size = _arm_gemm->get_working_size();
+
+    // Allocate workspace
+    if(workspace_size > 0)
+    {
+        const unsigned int alignment = 4096;
+        allocate_workspace(workspace_size, _workspace, &_memory_group, alignment, 1);
+        _arm_gemm->set_working_space(reinterpret_cast<float *>(_workspace.buffer()));
+    }
+
+    const unsigned int window_size = _arm_gemm->get_window_size();
+    if(window_size < num_threads)
+    {
+        num_threads = window_size;
+        _arm_gemm->set_nthreads(num_threads);
+    }
+
+    _gemm_kernel = std::move(acl_gemm_wrapper);
+
+    // Reorder the convoluted output to ACL's ordering NCHW
+    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
+    _transform_input_kernel   = std::move(transform_input_kernel);
+    _transform_weights_kernel = std::move(transform_weights_kernel);
+    _transform_output_kernel  = std::move(transform_output_kernel);
+
+    //Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
+}
+
+void NEWinogradConvolutionLayer::run()
+{
+    _memory_group.acquire();
+    if(!_reshaped_kernel)
+    {
+        _reshaped_kernel = true;
+        _permute_weights.run();
+        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+    }
+    //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
+    _permute_input.run();
+
+    // Transform input tensor to the winograd domain
+    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
+
+    //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
+    NEScheduler::get().schedule(_gemm_kernel.get(), Window::DimX);
+
+    // Transform output tensor to the spatial domain
+    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
+
+    // Reorder the convoluted output to ACL's ordering NCHW
+    _permute_output.run();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
+    _memory_group.release();
+}
+
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(input->dimension(idx_width), input->dimension(idx_height));
+    const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
+                                                    kernel_size,
+                                                    input_dims,
+                                                    conv_info,
+                                                    input->data_layout());
+
+    // Validate input transform
+    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
+    switch(weights->dimension(idx_width))
+    {
+        case 3:
+        {
+            if(input_dims.width > 4 && input_dims.height > 4)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
+            }
+            break;
+        }
+        case 5:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+            break;
+        }
+    }
+    // Validate filter transform
+    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+
+    switch(weights->dimension(idx_width))
+    {
+        case 3:
+        {
+            if(input_dims.width > 4 && input_dims.height > 4)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
+            }
+            break;
+        }
+        case 5:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+            break;
+        }
+    }
+    // Validate batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    switch(weights->dimension(idx_width))
+    {
+        case 3:
+        {
+            if(input_dims.width > 4 && input_dims.height > 4)
+            {
+                // Validate output transform
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            }
+            else
+            {
+                // Validate output transform
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            }
+            break;
+        }
+        case 5:
+        {
+            // Validate output transform
+            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
+            break;
+        }
+    }
+
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
deleted file mode 100644
index 0ac6d09..0000000
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-
-namespace
-{
-inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
-{
-    const int in_width    = input->info()->dimension(0);
-    const int in_height   = input->info()->dimension(1);
-    const int in_batches  = input->info()->dimension(3);
-    const int in_channels = input->info()->dimension(2);
-    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
-}
-} /* namespace */
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, biases);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    }
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
-    ARM_COMPUTE_UNUSED(output);
-
-    return Status{};
-}
-} //namespace
-
-NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _batched_gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _permute_input(),
-      _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
-      _reshaped_kernel(false)
-{
-} /* arm_compute */
-
-void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
-    ARM_COMPUTE_UNUSED(conv_info);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), biases->info(), output->info(), conv_info));
-
-    _weights = weights;
-    _input   = input;
-    _output  = output;
-
-    std::unique_ptr<INEWinogradLayerBatchedGEMMKernel<float, float>> batched_gemm_kernel;
-    std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
-    std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
-    std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
-
-    switch(weights->info()->dimension(0))
-    {
-        case 3:
-        {
-            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>>();
-            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
-            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
-            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
-            break;
-        }
-        case 5:
-        {
-            batched_gemm_kernel      = support::cpp14::make_unique<NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>>();
-            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
-            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
-            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
-        }
-    }
-
-    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
-    const bool        use_same_padding = use_padding_type == PADDING_SAME;
-
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
-    // Get convolved dimensions
-    const int in_channels  = input->info()->dimension(2);
-    const int out_channels = output->info()->dimension(2);
-
-    const Tensor4DShape in_shape(internal_get_input_shape(input));
-    const size_t        data_type_size = input->info()->element_size();
-    // Get the memory required to instantiate a new Winograd operator.
-    constexpr size_t storage_alignment   = 64;
-    const size_t     kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
-    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _kernel_storage.allocator()->allocate();
-    // Input storage
-    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
-    _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _input_workspace.allocator()->allocate();
-
-    // Output storage
-    const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, use_same_padding) * data_type_size;
-    _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
-    _output_workspace.allocator()->allocate();
-
-    // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
-                                _output->info()->dimension(1), _output->info()->dimension(3)),
-                    1, _output->info()->data_type());
-    _output_nhwc.allocator()->init(info);
-    _output_nhwc.allocator()->allocate();
-
-    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-    _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
-    _weights_hwio.allocator()->allocate();
-
-    // configure the kernel to transform the input tensor from NCHW -> NHWC
-    _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
-    _input_nhwc.allocator()->allocate();
-
-    const int         weights_width  = weights->info()->dimension(0);
-    const int         weights_height = weights->info()->dimension(1);
-    const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
-
-    // Configure the InputTransform
-    const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    transform_input_kernel->configure(reinterpret_cast<float *>(_input_nhwc.buffer()), in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                      reinterpret_cast<float *>(_input_workspace.buffer()), input_matrix_stride);
-
-    // Configure WeightsTransform
-    const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
-    transform_weights_kernel->configure(&_weights_hwio, reinterpret_cast<float *>(_kernel_storage.buffer()), kernel_matrix_stride, out_channels, in_channels);
-
-    // Configure OutputTransform
-    //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-    const int  output_matrix_stride = transform_output_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
-    const auto output_shape(transform_output_kernel->get_output_shape(kernel_shape, in_shape, use_padding_type));
-
-    transform_output_kernel->configure(biases, reinterpret_cast<float *>(_output_workspace.buffer()),
-                                       output_matrix_stride, reinterpret_cast<float *>(_output_nhwc.buffer()),
-                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
-
-    // Configure Batched GEMMs
-    const int      output_tile_rows         = batched_gemm_kernel->get_output_tile_rows();
-    const int      output_tile_cols         = batched_gemm_kernel->get_output_tile_cols();
-    const int      n_block                  = batched_gemm_kernel->get_number_blocks();
-    const int      tile_rows                = iceildiv(output_shape.n_rows, output_tile_rows);
-    const int      tile_cols                = iceildiv(output_shape.n_cols, output_tile_cols);
-    const int      m                        = in_shape.n_batches * tile_rows * tile_cols;
-    const int      k                        = in_shape.n_channels;
-    const int      n                        = out_channels;
-    const int      input_matrix_row_stride  = in_shape.n_channels;
-    const int      kernel_matrix_row_stride = roundup(out_channels, n_block);
-    const int      output_matrix_row_stride = kernel_matrix_row_stride;
-    const unsigned n_gemms                  = batched_gemm_kernel->get_number_gemms();
-
-    batched_gemm_kernel->configure(n_gemms, m, k, n,
-                                   input_matrix_stride, input_matrix_row_stride,
-                                   kernel_matrix_stride, kernel_matrix_row_stride,
-                                   output_matrix_stride, output_matrix_row_stride,
-                                   reinterpret_cast<float *>(_input_workspace.buffer()),
-                                   reinterpret_cast<float *>(_kernel_storage.buffer()),
-                                   reinterpret_cast<float *>(_output_workspace.buffer()));
-
-    // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
-
-    _transform_input_kernel   = std::move(transform_input_kernel);
-    _transform_weights_kernel = std::move(transform_weights_kernel);
-    _transform_output_kernel  = std::move(transform_output_kernel);
-    _batched_gemm_kernel      = std::move(batched_gemm_kernel);
-}
-
-void NEWinogradLayer::run()
-{
-    _memory_group.acquire();
-    if(!_reshaped_kernel)
-    {
-        _reshaped_kernel = true;
-        _permute_weights.run();
-        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
-    }
-    //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
-    _permute_input.run();
-
-    // Transform input tensor to the winograd domain
-    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
-    //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    NEScheduler::get().schedule(_batched_gemm_kernel.get(), Window::DimX);
-
-    // Transform output tensor to the spatial domain
-    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
-    // Reorder the convoluted output to ACL's ordering NCHW
-    _permute_output.run();
-    _memory_group.release();
-}
-
-Status NEWinogradLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, biases, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, weights, biases, output, conv_info));
-
-    return Status{};
-}
-
-} // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index c6802f3..795c96c 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CPUUtils.h"
 
 #include <omp.h>
 
@@ -41,6 +42,7 @@
 OMPScheduler::OMPScheduler() // NOLINT
     : _num_threads(omp_get_max_threads())
 {
+    get_cpu_configuration(_cpu_info);
 }
 
 unsigned int OMPScheduler::num_threads() const
@@ -59,7 +61,7 @@
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
     ThreadInfo info;
-    info.cpu_info = _info;
+    info.cpu_info = &_cpu_info;
 
     const Window      &max_window     = kernel->window();
     const unsigned int num_iterations = max_window.num_iterations(split_dimension);
@@ -74,7 +76,7 @@
         #pragma omp parallel firstprivate(info) num_threads(info.num_threads)
         {
             const int tid  = omp_get_thread_num();
-            Window win     = max_window.split_window(split_dimension, tid, info.num_threads);
+            Window    win  = max_window.split_window(split_dimension, tid, info.num_threads);
             info.thread_id = tid;
             kernel->run(win, info);
         }
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index 4540aea..d0b3bde 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,19 +58,24 @@
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
 
     // Update blob size
-    size_t max_group_size = std::accumulate(std::begin(_active_elements), std::end(_active_elements), static_cast<size_t>(0), [](size_t s, const Element & e)
+    size_t max_group_size = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), static_cast<size_t>(0), [](size_t s, const Blob & b)
     {
-        return s + e.size;
+        return s + b.max_size;
     });
     _blob = std::max(_blob, max_group_size);
 
     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
     size_t offset         = 0;
-    for(auto &e : _active_elements)
+    for(auto &free_blob : _free_blobs)
     {
-        group_mappings[e.handle] = offset;
-        offset += e.size;
+        for(auto &bound_element_id : free_blob.bound_elements)
+        {
+            ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
+            Element &bound_element               = _active_elements[bound_element_id];
+            group_mappings[bound_element.handle] = offset;
+        }
+        offset += free_blob.max_size;
         ARM_COMPUTE_ERROR_ON(offset > _blob);
     }
 }
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 42cc943..293241d 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,3 +72,10 @@
     // Update semaphore
     _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
 }
+
+size_t PoolManager::num_pools() const
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+
+    return _free_pools.size() + _occupied_pools.size();
+}
\ No newline at end of file
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index c5b8f33..b010a32 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -27,6 +27,11 @@
 
 using namespace arm_compute;
 
+SubTensor::SubTensor()
+    : _parent(nullptr), _info()
+{
+}
+
 SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
     : _parent(nullptr), _info()
 {
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index a0d41b2..993a95b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryRegion.h"
 #include "support/ToolchainSupport.h"
 
 #include <cstddef>
@@ -114,7 +115,7 @@
     ARM_COMPUTE_UNUSED(validate_subtensor_shape);
 
     // Copy pointer to buffer
-    _memory = Memory(allocator._memory.buffer());
+    _memory = Memory(allocator._memory.region());
 
     // Init tensor info with new dimensions
     size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
@@ -126,22 +127,23 @@
 
 uint8_t *TensorAllocator::data() const
 {
-    return _memory.buffer();
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 void TensorAllocator::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+
     if(_associated_memory_group == nullptr)
     {
-        _memory = Memory(std::shared_ptr<uint8_t>(new uint8_t[info().total_size()](), [](uint8_t *ptr)
-        {
-            delete[] ptr;
-        }));
+        _memory = Memory(std::make_shared<MemoryRegion>(info().total_size()));
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.handle()), info().total_size());
+        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.region()->handle()), info().total_size());
+        _memory.region()->set_size(info().total_size());
     }
     info().set_is_resizable(false);
 }
@@ -154,7 +156,8 @@
 
 arm_compute::Status TensorAllocator::import_memory(Memory memory)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(memory.buffer() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->buffer() == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
     _memory = memory;
     info().set_is_resizable(false);
@@ -164,15 +167,17 @@
 
 void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
 {
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
     _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *TensorAllocator::lock()
 {
-    return _memory.buffer();
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
+    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 void TensorAllocator::unlock()